|
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

##############################################################################
# Staging directory for downloads and source builds
##############################################################################
# Later sections download, unpack and build their sources under this path.
ENV STAGE_DIR=/tmp
RUN mkdir -p "${STAGE_DIR}"
| 8 | + |
##############################################################################
# Installation/Basic Utilities
##############################################################################
# Build toolchain, editors, SSH client/server, networking tools and the
# LLVM 14 development package.
#
# DEBIAN_FRONTEND is passed inline because this file only exports it in the
# Python section further down; without it a package configuration prompt can
# stall a non-interactive image build.
# Packages are listed one per line in alphabetical order so additions and
# removals show up as single-line diffs.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        g++ \
        gcc \
        htop \
        iftop \
        iotop \
        iputils-ping \
        less \
        llvm-14-dev \
        net-tools \
        nfs-common \
        openssh-client \
        openssh-server \
        pdsh \
        rsync \
        software-properties-common \
        sudo \
        tmux \
        unzip \
        vim \
        wget
| 21 | + |
##############################################################################
# Latest Git (git-core PPA)
##############################################################################
# Register the git-core PPA, refresh the package index, install git, and
# print the installed version to the build log.
RUN add-apt-repository -y ppa:git-core/ppa \
 && apt-get update \
 && apt-get install -y git \
 && git --version
| 29 | + |
##############################################################################
# SSH daemon: client keep-alive and explicit "Port 22"
##############################################################################
# Append a 30-second ClientAliveInterval to the daemon configuration.
RUN printf '%s\n' 'ClientAliveInterval 30' >> /etc/ssh/sshd_config
# Uncomment the first "#Port 22" line: sed reads a staged copy of the file
# and its output overwrites the live configuration.
RUN cp /etc/ssh/sshd_config "${STAGE_DIR}/sshd_config" \
 && sed -e '0,/^#Port 22/s//Port 22/' "${STAGE_DIR}/sshd_config" > /etc/ssh/sshd_config
| 37 | + |
##############################################################################
# Mellanox OFED
##############################################################################
ENV MLNX_OFED_VERSION=5.7-1.0.2.0
# Refresh the package index in the same layer as the install, so a cached and
# stale index from an earlier layer cannot make this step fail.
RUN apt-get update && \
    apt-get install -y libnuma-dev
# Download the OFED bundle, run its installer in user-space-only mode without
# a firmware update, then delete the archive and unpacked tree in this same
# layer so neither is kept in the image.
# The archive is written to disk before extraction: with `wget | tar` under
# /bin/sh only tar's exit status is checked, so a failed download is not
# reported as such.
# NOTE(review): the bundle is fetched over plain HTTP with no checksum --
# consider HTTPS plus a pinned SHA-256.
RUN cd ${STAGE_DIR} && \
    OFED_PKG=MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
    wget -q -O ${OFED_PKG}.tgz http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${OFED_PKG}.tgz && \
    tar xzf ${OFED_PKG}.tgz && \
    cd ${OFED_PKG} && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/${OFED_PKG}*
| 49 | + |
##############################################################################
# nv_peer_mem
##############################################################################
# NV_PEER_MEM_VERSION names the source tarball and the directory it unpacks
# to; NV_PEER_MEM_TAG is the git tag that is checked out and also the version
# string in the name of the .deb installed below.
ENV NV_PEER_MEM_VERSION=1.3
ENV NV_PEER_MEM_TAG=1.3-0
# Clone the tagged sources, run the upstream build script, unpack the
# resulting .orig tarball from STAGE_DIR, build the Debian packages and
# install the main one.
# The .deb file name is derived from NV_PEER_MEM_TAG rather than hard-coded:
# it was previously pinned to 1.2-0, which does not match the 1.3-0 tag built
# here, so changing the version variables alone could never work.
# NOTE(review): assumes build_module.sh leaves
# nvidia-peer-memory_<version>.orig.tar.gz in STAGE_DIR, as the tar step
# below expects -- confirm when bumping versions.
RUN mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
| 66 | + |
##############################################################################
# OpenMPI (built from source)
##############################################################################
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.4
# Stream the release tarball into tar, configure with a versioned prefix,
# build and install with one make job per CPU, and expose the install as
# /usr/local/mpi through a symlink. The bracket test is a sanity check that
# the C++ compiler wrapper was installed; the source tree is removed last.
RUN cd "${STAGE_DIR}" \
 && wget -q -O - "https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz" | tar xzf - \
 && cd "openmpi-${OPENMPI_VERSION}" \
 && ./configure --prefix="/usr/local/openmpi-${OPENMPI_VERSION}" \
 && make -j"$(nproc)" install \
 && ln -s "/usr/local/openmpi-${OPENMPI_VERSION}" /usr/local/mpi \
 && [ -f /usr/local/mpi/bin/mpic++ ] \
 && cd "${STAGE_DIR}" \
 && rm -r "${STAGE_DIR}/openmpi-${OPENMPI_VERSION}"
# Put the MPI binaries and libraries ahead of the inherited search paths.
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Replace mpirun with a small bash wrapper that always passes
# --allow-run-as-root and --prefix /usr/local/mpi to the real launcher
# (renamed to mpirun.real), so MPI jobs can be started as root by default.
# The wrapper uses `exec` so that mpirun.real replaces the wrapper shell:
# without it an extra bash process sits between the caller and the launcher,
# and a signal sent to the wrapper's PID is not delivered to mpirun.real.
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'exec mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' \
        > /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun
| 89 | + |
##############################################################################
# Python
##############################################################################
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3
# Install Python 3 with headers, make `python` point at python3, and
# bootstrap pip from the official get-pip.py script.
# - `apt-get update` runs in this same layer so the install never depends on
#   a package index cached by an earlier layer.
# - curl uses -f so an HTTP error aborts the build instead of saving an error
#   page as get-pip.py, and -L so redirects are followed.
RUN apt-get update && \
    apt-get install -y python3 python3-dev && \
    rm -f /usr/bin/python && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    curl -fsSL -o get-pip.py https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py && \
    pip install --upgrade pip && \
    python -V && pip -V
# The final step above prints the Python and pip versions to the build log.
RUN pip install pyyaml
RUN pip install ipython
| 106 | + |
##############################################################################
# Additional system libraries
##############################################################################
# Development headers (libaio, CUPTI, JPEG, PNG, libsndfile) plus screen,
# installed without recommended extras.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        libaio-dev \
        libcupti-dev \
        libjpeg-dev \
        libpng-dev \
        libsndfile-dev \
        screen
# Python packages used for profiling, data handling, docs and GPU tooling.
# Two entries were dropped from the previous list:
#   - `sklearn`: the deprecated PyPI placeholder, not the real library;
#     the real package, scikit-learn, is already listed here.
#     NOTE(review): the placeholder is understood to error out at install
#     time now -- confirm against PyPI.
#   - the second, duplicate `pandas`.
# Entries are alphabetical, one per line, so duplicates are easy to spot.
RUN pip install \
        astor \
        boto3 \
        cffi \
        cupy-cuda11x \
        graphviz \
        ipdb \
        matplotlib \
        mpi4py \
        msgpack \
        numpy \
        nvidia-ml-py3 \
        pandas \
        psutil \
        py3nvml \
        pyarrow \
        requests \
        scikit-learn \
        scipy \
        sentencepiece \
        sphinx \
        sphinx_rtd_theme \
        tqdm \
        yappi
| 143 | + |
##############################################################################
# SSH daemon port inside the container must not conflict with the host OS port
##############################################################################
ENV SSH_PORT=2222
# Point sshd at SSH_PORT by rewriting the first Port line in sshd_config.
# The pattern accepts both "#Port 22" and "Port 22": an earlier step in this
# file already uncomments that line, so matching only the commented form
# never fires and the daemon silently stays on port 22.
# The grep is a sanity check that fails the build if the port was not set.
RUN sed -i "0,/^#\?Port 22/s//Port ${SSH_PORT}/" /etc/ssh/sshd_config && \
    grep -q "^Port ${SSH_PORT}" /etc/ssh/sshd_config
| 150 | + |
##############################################################################
# PyTorch, torchvision and tensorboardX (pinned versions)
##############################################################################
ENV PYTORCH_VERSION=1.13.0 \
    TORCHVISION_VERSION=0.14.0 \
    TENSORBOARDX_VERSION=2.5
# One layer per package, each pinned to the version declared above.
RUN pip install "torch==${PYTORCH_VERSION}"
RUN pip install "torchvision==${TORCHVISION_VERSION}"
RUN pip install "tensorboardX==${TENSORBOARDX_VERSION}"
| 160 | + |
##############################################################################
# PyYAML build issue workaround
# https://stackoverflow.com/a/53926898
##############################################################################
# Delete the yaml package directory and any PyYAML-* metadata from the
# system dist-packages directory.
RUN rm -rf \
        /usr/lib/python3/dist-packages/yaml \
        /usr/lib/python3/dist-packages/PyYAML-*
| 167 | + |
##############################################################################
# DeepSpeed
##############################################################################
# ninja is installed before the DeepSpeed build step runs.
RUN pip install ninja
# Clone, build and delete the sources in ONE layer: removing the checkout in
# a later RUN leaves it stored in the earlier layer, so the image would not
# get any smaller.
# The DS_BUILD_* variables are passed in the environment of `pip install .`.
# NOTE(review): this builds whatever `master` points to at build time, so the
# image is not reproducible -- consider pinning a release tag or commit.
RUN git clone --branch master https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
    cd ${STAGE_DIR}/DeepSpeed && \
    DS_BUILD_FUSED_LAMB=1 \
    DS_BUILD_FUSED_ADAM=1 \
    DS_BUILD_TRANSFORMER=1 \
    DS_BUILD_TRANSFORMER_INFERENCE=1 \
    DS_BUILD_STOCHASTIC_TRANSFORMER=1 \
    DS_BUILD_UTILS=1 \
    DS_BUILD_AIO=1 \
    DS_BUILD_CPU_ADAM=1 \
    pip install . && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/DeepSpeed
# Smoke test: the package must import, print its version, and ds_report must run.
RUN python -c "import deepspeed; print(deepspeed.__version__)" && ds_report

WORKDIR /root
0 commit comments