Skip to content

Commit 3255c0f

Browse files
committed
🐛 Fix DeepSpeed image bug; Add all running configs
1 parent 46f6c16 commit 3255c0f

7 files changed

Lines changed: 334 additions & 8 deletions

File tree

Dockerfile

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
1+
FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
22

33
##############################################################################
44
# Temporary Installation Directory
@@ -9,6 +9,8 @@ RUN mkdir -p ${STAGE_DIR}
99
##############################################################################
1010
# Installation/Basic Utilities
1111
##############################################################################
12+
ENV DEBIAN_FRONTEND=noninteractive
13+
ENV TZ=America/Los_Angeles
1214
RUN apt-get update && \
1315
apt-get install -y --no-install-recommends \
1416
software-properties-common build-essential autotools-dev \
@@ -17,7 +19,7 @@ RUN apt-get update && \
1719
curl wget vim tmux emacs less unzip \
1820
htop iftop iotop ca-certificates openssh-client openssh-server \
1921
rsync iputils-ping net-tools sudo \
20-
llvm-14-dev
22+
llvm-11-dev
2123

2224
##############################################################################
2325
# Installation Latest Git
@@ -41,11 +43,11 @@ RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
4143
ENV MLNX_OFED_VERSION=5.7-1.0.2.0
4244
RUN apt-get install -y libnuma-dev
4345
RUN cd ${STAGE_DIR} && \
44-
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
45-
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
46+
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
47+
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
4648
./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
4749
cd ${STAGE_DIR} && \
48-
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
50+
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
4951

5052
##############################################################################
5153
# nv_peer_mem
@@ -90,7 +92,6 @@ RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
9092
##############################################################################
9193
# Python
9294
##############################################################################
93-
ENV DEBIAN_FRONTEND=noninteractive
9495
ENV PYTHON_VERSION=3
9596
RUN apt-get install -y python3 python3-dev && \
9697
rm -f /usr/bin/python && \
@@ -169,11 +170,11 @@ RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
169170
# DeepSpeed
170171
##############################################################################
171172
RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
172-
RUN pip install ninja
173+
RUN pip install triton==1.0.0
173174
RUN cd ${STAGE_DIR}/DeepSpeed && \
174175
git checkout . && \
175176
git checkout master && \
176-
DS_BUILD_FUSED_LAMB=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_TRANSFORMER_INFERENCE=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 DS_BUILD_AIO=1 DS_BUILD_CPU_ADAM=1 pip install .
177+
DS_BUILD_OPS=1 pip install .
177178
RUN rm -rf ${STAGE_DIR}/DeepSpeed
178179
RUN python -c "import deepspeed; print(deepspeed.__version__)" && ds_report
179180

mnist/interactive.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: stpp
5+
spec:
6+
containers:
7+
- name: gpu-container
8+
# torch113_cuda117_ds076
9+
# image: docker.io/deepspeed/deepspeed:v072_torch112_cu117
10+
image: docker.io/zihaokevinzhou/deepspeed:torch113_cuda117_ds076
11+
imagePullPolicy: Always
12+
command: ["sleep", "infinity"]
13+
volumeMounts:
14+
- mountPath: /stpp-vol
15+
name: stpp-vol
16+
- mountPath: /dev/shm
17+
name: dev-shm
18+
resources:
19+
limits:
20+
nvidia.com/gpu: "1"
21+
memory: "20G"
22+
cpu: "12"
23+
requests:
24+
nvidia.com/gpu: "1"
25+
memory: "8G"
26+
cpu: "4"
27+
restartPolicy: Never
28+
affinity:
29+
nodeAffinity:
30+
requiredDuringSchedulingIgnoredDuringExecution:
31+
nodeSelectorTerms:
32+
- matchExpressions:
33+
- key: nvidia.com/gpu.product
34+
operator: In
35+
values:
36+
- NVIDIA-GeForce-RTX-3090
37+
- key: kubernetes.io/hostname
38+
operator: In
39+
values:
40+
- k8s-3090-02.clemson.edu
41+
volumes:
42+
- name: stpp-vol
43+
persistentVolumeClaim:
44+
claimName: stpp-vol
45+
- name: dev-shm
46+
emptyDir:
47+
medium: Memory
48+
sizeLimit: 1Gi

mnist/mnist.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
###############################
2+
# MNIST k8s SingleGPU Example #
3+
###############################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: docker.io/horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ["git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
python mnist_ddp.py --batch-size 64 --epochs 10"]
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

mnist/mnist_ddp.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
################################
2+
# MNIST k8s DDP(Naive) Example #
3+
################################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ["git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
torchrun --nproc_per_node=4 mnist_ddp.py --batch-size 64 --epochs 10"]
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

mnist/mnist_ds.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
###############################
2+
# MNIST k8s DeepSpeed Example #
3+
###############################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: docker.io/zihaokevinzhou/deepspeed:torch113_cuda117_ds076
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ['git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
deepspeed mnist_ds.py --deepspeed --deepspeed_config config.json --epochs 10']
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

mnist/mnist_hf.yaml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#################################
2+
# MNIST k8s HuggingFace Example #
3+
#################################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ['git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
pip install accelerate;
20+
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision=\"fp16\")";
21+
accelerate launch mnist_hf.py --epochs 10']
22+
volumeMounts:
23+
- mountPath: /stpp-vol
24+
name: stpp-vol
25+
- mountPath: /dev/shm
26+
name: dev-shm
27+
resources:
28+
limits:
29+
nvidia.com/gpu: "4"
30+
memory: "20G"
31+
cpu: "12"
32+
requests:
33+
nvidia.com/gpu: "4"
34+
memory: "8G"
35+
cpu: "4"
36+
restartPolicy: Never
37+
affinity:
38+
nodeAffinity:
39+
requiredDuringSchedulingIgnoredDuringExecution:
40+
nodeSelectorTerms:
41+
- matchExpressions:
42+
- key: nvidia.com/gpu.product
43+
operator: In
44+
values:
45+
- NVIDIA-GeForce-RTX-3090
46+
- key: kubernetes.io/hostname
47+
operator: In
48+
values:
49+
- k8s-3090-02.clemson.edu
50+
volumes:
51+
- name: stpp-vol
52+
persistentVolumeClaim:
53+
claimName: stpp-vol
54+
- name: dev-shm
55+
emptyDir:
56+
medium: Memory
57+
sizeLimit: 1Gi

mnist/mnist_hvd.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#############################
2+
# MNIST k8s Horovod Example #
3+
#############################
4+
apiVersion: batch/v1
5+
kind: Job
6+
metadata:
7+
name: mnist
8+
namespace: deep-forecast
9+
spec:
10+
template:
11+
spec:
12+
containers:
13+
- name: gpu-container
14+
image: docker.io/horovod/horovod:sha-811cf67
15+
command: ["/bin/bash","-c"]
16+
# NCCL_DEBUG=INFO
17+
args: ["git clone https://github.com/Rose-STL-Lab/nautilus_tutorial.git;
18+
cd nautilus_tutorial;
19+
horovodrun -np 4 -H localhost:4 python mnist_hvd.py --batch-size 64 --epochs 10"]
20+
volumeMounts:
21+
- mountPath: /stpp-vol
22+
name: stpp-vol
23+
- mountPath: /dev/shm
24+
name: dev-shm
25+
resources:
26+
limits:
27+
nvidia.com/gpu: "4"
28+
memory: "20G"
29+
cpu: "12"
30+
requests:
31+
nvidia.com/gpu: "4"
32+
memory: "8G"
33+
cpu: "4"
34+
restartPolicy: Never
35+
affinity:
36+
nodeAffinity:
37+
requiredDuringSchedulingIgnoredDuringExecution:
38+
nodeSelectorTerms:
39+
- matchExpressions:
40+
- key: nvidia.com/gpu.product
41+
operator: In
42+
values:
43+
- NVIDIA-GeForce-RTX-3090
44+
- key: kubernetes.io/hostname
45+
operator: In
46+
values:
47+
- k8s-3090-02.clemson.edu
48+
volumes:
49+
- name: stpp-vol
50+
persistentVolumeClaim:
51+
claimName: stpp-vol
52+
- name: dev-shm
53+
emptyDir:
54+
medium: Memory
55+
sizeLimit: 1Gi

0 commit comments

Comments
 (0)