Skip to content

Commit 3255c0f

Browse files
committed
🐛 Fix DeepSpeed image bug; Add all running configs
1 parent 46f6c16 commit 3255c0f

7 files changed

Lines changed: 334 additions & 8 deletions

File tree

Dockerfile

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
1+
FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
22

33
##############################################################################
44
# Temporary Installation Directory
@@ -9,6 +9,8 @@ RUN mkdir -p ${STAGE_DIR}
99
##############################################################################
1010
# Installation/Basic Utilities
1111
##############################################################################
12+
ENV DEBIAN_FRONTEND=noninteractive
13+
ENV TZ=America/Los_Angeles
1214
RUN apt-get update && \
1315
apt-get install -y --no-install-recommends \
1416
software-properties-common build-essential autotools-dev \
@@ -17,7 +19,7 @@ RUN apt-get update && \
1719
curl wget vim tmux emacs less unzip \
1820
htop iftop iotop ca-certificates openssh-client openssh-server \
1921
rsync iputils-ping net-tools sudo \
20-
llvm-14-dev
22+
llvm-11-dev
2123

2224
##############################################################################
2325
# Installation Latest Git
@@ -41,11 +43,11 @@ RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
4143
ENV MLNX_OFED_VERSION=5.7-1.0.2.0
4244
RUN apt-get install -y libnuma-dev
4345
RUN cd ${STAGE_DIR} && \
44-
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
45-
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
46+
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
47+
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
4648
./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
4749
cd ${STAGE_DIR} && \
48-
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
50+
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
4951

5052
##############################################################################
5153
# nv_peer_mem
@@ -90,7 +92,6 @@ RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
9092
##############################################################################
9193
# Python
9294
##############################################################################
93-
ENV DEBIAN_FRONTEND=noninteractive
9495
ENV PYTHON_VERSION=3
9596
RUN apt-get install -y python3 python3-dev && \
9697
rm -f /usr/bin/python && \
@@ -169,11 +170,11 @@ RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
169170
# DeepSpeed
170171
##############################################################################
171172
RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
172-
RUN pip install ninja
173+
RUN pip install triton==1.0.0
173174
RUN cd ${STAGE_DIR}/DeepSpeed && \
174175
git checkout . && \
175176
git checkout master && \
176-
DS_BUILD_FUSED_LAMB=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_TRANSFORMER_INFERENCE=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 DS_BUILD_AIO=1 DS_BUILD_CPU_ADAM=1 pip install .
177+
DS_BUILD_OPS=1 pip install .
177178
RUN rm -rf ${STAGE_DIR}/DeepSpeed
178179
RUN python -c "import deepspeed; print(deepspeed.__version__)" && ds_report
179180

mnist/interactive.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: stpp
5+
spec:
6+
containers:
7+
- name: gpu-container
8+
# torch113_cuda117_ds076
9+
# image: docker.io/deepspeed/deepspeed:v072_torch112_cu117
10+
image: docker.io/zihaokevinzhou/deepspeed:torch113_cuda117_ds076
11+
imagePullPolicy: Always
12+
command: ["sleep", "infinity"]
13+
volumeMounts:
14+
- mountPath: /stpp-vol
15+
name: stpp-vol
16+
- mountPath: /dev/shm
17+
name: dev-shm
18+
resources:
19+
limits:
20+
nvidia.com/gpu: "1"
21+
memory: "20G"
22+
cpu: "12"
23+
requests:
24+
nvidia.com/gpu: "1"
25+
memory: "8G"
26+
cpu: "4"
27+
restartPolicy: Never
28+
affinity:
29+
nodeAffinity:
30+
requiredDuringSchedulingIgnoredDuringExecution:
31+
nodeSelectorTerms:
32+
- matchExpressions:
33+
- key: nvidia.com/gpu.product
34+
operator: In
35+
values:
36+
- NVIDIA-GeForce-RTX-3090
37+
- key: kubernetes.io/hostname
38+
operator: In
39+
values:
40+
- k8s-3090-02.clemson.edu
41+
volumes:
42+
- name: stpp-vol
43+
persistentVolumeClaim:
44+
claimName: stpp-vol
45+
- name: dev-shm
46+
emptyDir:
47+
medium: Memory
48+
sizeLimit: 1Gi

mnist/mnist.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
###############################
2+
# MNIST k8s SingleGPU Example #
3+
###############################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: docker.io/horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ["git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
python mnist_ddp.py --batch-size 64 --epochs 10"]
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

mnist/mnist_ddp.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
################################
2+
# MNIST k8s DDP(Naive) Example #
3+
################################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ["git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
torchrun --nproc_per_node=4 mnist_ddp.py --batch-size 64 --epochs 10"]
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

mnist/mnist_ds.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
###############################
2+
# MNIST k8s DeepSpeed Example #
3+
###############################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: docker.io/zihaokevinzhou/deepspeed:torch113_cuda117_ds076
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ['git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
deepspeed mnist_ds.py --deepspeed --deepspeed_config config.json --epochs 10']
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

mnist/mnist_hf.yaml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#################################
2+
# MNIST k8s HuggingFace Example #
3+
#################################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ['git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
pip install accelerate;
20+
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision=\"fp16\")";
21+
accelerate launch mnist_hf.py --epochs 10']
22+
volumeMounts:
23+
- mountPath: /stpp-vol
24+
name: stpp-vol
25+
- mountPath: /dev/shm
26+
name: dev-shm
27+
resources:
28+
limits:
29+
nvidia.com/gpu: "4"
30+
memory: "20G"
31+
cpu: "12"
32+
requests:
33+
nvidia.com/gpu: "4"
34+
memory: "8G"
35+
cpu: "4"
36+
restartPolicy: Never
37+
affinity:
38+
nodeAffinity:
39+
requiredDuringSchedulingIgnoredDuringExecution:
40+
nodeSelectorTerms:
41+
- matchExpressions:
42+
- key: nvidia.com/gpu.product
43+
operator: In
44+
values:
45+
- NVIDIA-GeForce-RTX-3090
46+
- key: kubernetes.io/hostname
47+
operator: In
48+
values:
49+
- k8s-3090-02.clemson.edu
50+
volumes:
51+
- name: stpp-vol
52+
persistentVolumeClaim:
53+
claimName: stpp-vol
54+
- name: dev-shm
55+
emptyDir:
56+
medium: Memory
57+
sizeLimit: 1Gi

mnist/mnist_hvd.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#############################
2+
# MNIST k8s Horovod Example #
3+
#############################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: docker.io/horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ["git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
horovodrun -np 4 -H localhost:4 python mnist_hvd.py --batch-size 64 --epochs 10"]
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

0 commit comments

Comments
 (0)