Skip to content

Commit 46f6c16

Browse files
committed
Merge remote-tracking branch 'origin/main'
2 parents 9236fa2 + 2635b21 commit 46f6c16

2 files changed

Lines changed: 181 additions & 8 deletions

File tree

Dockerfile

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,176 @@ FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
55
##############################################################################
66
ENV STAGE_DIR=/tmp
77
RUN mkdir -p ${STAGE_DIR}
8+
9+
##############################################################################
10+
# Installation/Basic Utilities
11+
##############################################################################
12+
RUN apt-get update && \
13+
apt-get install -y --no-install-recommends \
14+
software-properties-common build-essential autotools-dev \
15+
nfs-common pdsh \
16+
cmake g++ gcc \
17+
curl wget vim tmux emacs less unzip \
18+
htop iftop iotop ca-certificates openssh-client openssh-server \
19+
rsync iputils-ping net-tools sudo \
20+
llvm-14-dev
21+
22+
##############################################################################
23+
# Install Latest Git
24+
##############################################################################
25+
RUN add-apt-repository ppa:git-core/ppa -y && \
26+
apt-get update && \
27+
apt-get install -y git && \
28+
git --version
29+
30+
##############################################################################
31+
# Client Liveness & Uncomment Port 22 for SSH Daemon
32+
##############################################################################
33+
# Keep SSH client alive from server side
34+
RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
35+
RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
36+
sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
37+
38+
##############################################################################
39+
# Mellanox OFED
40+
##############################################################################
41+
ENV MLNX_OFED_VERSION=5.7-1.0.2.0
42+
# Refresh the package index in the same layer so this install does not depend on lists cached by an earlier layer.
RUN apt-get update && apt-get install -y libnuma-dev
43+
RUN cd ${STAGE_DIR} && \
44+
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
45+
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
46+
./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
47+
cd ${STAGE_DIR} && \
48+
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
49+
50+
##############################################################################
51+
# nv_peer_mem
52+
##############################################################################
53+
ENV NV_PEER_MEM_VERSION=1.3
54+
ENV NV_PEER_MEM_TAG=1.3-0
55+
RUN mkdir -p ${STAGE_DIR} && \
56+
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
57+
cd ${STAGE_DIR}/nv_peer_memory && \
58+
./build_module.sh && \
59+
cd ${STAGE_DIR} && \
60+
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
61+
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
62+
apt-get update && \
63+
apt-get install -y dkms && \
64+
dpkg-buildpackage -us -uc && \
65+
# Install the package built from NV_PEER_MEM_TAG; the previously hard-coded 1.2-0 did not match the 1.3-0 tag checked out above.
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
66+
67+
##############################################################################
68+
# OPENMPI
69+
##############################################################################
70+
ENV OPENMPI_BASEVERSION=4.1
71+
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.4
72+
RUN cd ${STAGE_DIR} && \
73+
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
74+
cd openmpi-${OPENMPI_VERSION} && \
75+
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
76+
make -j"$(nproc)" install && \
77+
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
78+
# Sanity check:
79+
test -f /usr/local/mpi/bin/mpic++ && \
80+
cd ${STAGE_DIR} && \
81+
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
82+
ENV PATH=/usr/local/mpi/bin:${PATH} \
83+
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
84+
# Create a wrapper for OpenMPI to allow running as root by default
85+
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
86+
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
87+
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
88+
chmod a+x /usr/local/mpi/bin/mpirun
89+
90+
##############################################################################
91+
# Python
92+
##############################################################################
93+
ENV DEBIAN_FRONTEND=noninteractive
94+
ENV PYTHON_VERSION=3
95+
RUN apt-get install -y python3 python3-dev && \
96+
rm -f /usr/bin/python && \
97+
ln -s /usr/bin/python3 /usr/bin/python && \
98+
curl -O https://bootstrap.pypa.io/get-pip.py && \
99+
python get-pip.py && \
100+
rm get-pip.py && \
101+
pip install --upgrade pip && \
102+
# Print python and pip versions
103+
python -V && pip -V
104+
RUN pip install pyyaml
105+
RUN pip install ipython
106+
107+
##############################################################################
108+
# Some Packages
109+
##############################################################################
110+
RUN apt-get update && \
111+
apt-get install -y --no-install-recommends \
112+
libsndfile-dev \
113+
libcupti-dev \
114+
libjpeg-dev \
115+
libpng-dev \
116+
screen \
117+
libaio-dev
118+
RUN pip install psutil \
119+
yappi \
120+
cffi \
121+
ipdb \
122+
pandas \
123+
matplotlib \
124+
py3nvml \
125+
pyarrow \
126+
graphviz \
127+
astor \
128+
boto3 \
129+
tqdm \
130+
sentencepiece \
131+
msgpack \
132+
requests \
133+
pandas \
134+
sphinx \
135+
sphinx_rtd_theme \
136+
scipy \
137+
numpy \
138+
# NOTE: the deprecated "sklearn" PyPI alias was dropped from this list; scikit-learn (listed next) is the real package.
139+
scikit-learn \
140+
nvidia-ml-py3 \
141+
mpi4py \
142+
cupy-cuda11x
143+
144+
##############################################################################
145+
## SSH daemon port inside container cannot conflict with host OS port
146+
###############################################################################
147+
ENV SSH_PORT=2222
148+
RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
149+
# Match the first Port line whether or not it is still commented out: an earlier step already rewrote "#Port 22" to "Port 22", so the old pattern never matched.
sed -E "0,/^#?Port [0-9]+/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
150+
151+
##############################################################################
152+
# PyTorch
153+
##############################################################################
154+
ENV PYTORCH_VERSION=1.13.0
155+
ENV TORCHVISION_VERSION=0.14.0
156+
ENV TENSORBOARDX_VERSION=2.5
157+
RUN pip install torch==${PYTORCH_VERSION}
158+
RUN pip install torchvision==${TORCHVISION_VERSION}
159+
RUN pip install tensorboardX==${TENSORBOARDX_VERSION}
160+
161+
##############################################################################
162+
# PyYAML build issue
163+
# https://stackoverflow.com/a/53926898
164+
##############################################################################
165+
RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
166+
rm -rf /usr/lib/python3/dist-packages/PyYAML-*
167+
168+
##############################################################################
169+
# DeepSpeed
170+
##############################################################################
171+
RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
172+
RUN pip install ninja
173+
RUN cd ${STAGE_DIR}/DeepSpeed && \
174+
git checkout . && \
175+
git checkout master && \
176+
DS_BUILD_FUSED_LAMB=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_TRANSFORMER_INFERENCE=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 DS_BUILD_AIO=1 DS_BUILD_CPU_ADAM=1 pip install .
177+
RUN rm -rf ${STAGE_DIR}/DeepSpeed
178+
RUN python -c "import deepspeed; print(deepspeed.__version__)" && ds_report
179+
180+
WORKDIR /root

mnist_ds.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def train(args, model, train_loader, epoch):
1919
loss = F.nll_loss(output, target)
2020
model.backward(loss)
2121
model.step()
22-
if torch.distributed.get_rank() == 0:
22+
if dist.get_rank() == 0:
2323
if batch_idx % args.log_interval == 0:
2424
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
2525
epoch, dist.get_world_size() * batch_idx * len(data), len(train_loader.dataset),
@@ -41,7 +41,7 @@ def test(model, device, test_loader):
4141

4242
test_loss /= len(test_loader.dataset)
4343

44-
if torch.distributed.get_rank() == 0:
44+
if dist.get_rank() == 0:
4545
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
4646
test_loss, correct, len(test_loader.dataset),
4747
100. * correct / len(test_loader.dataset)))
@@ -86,17 +86,17 @@ def main():
8686
transforms.Normalize((0.1307,), (0.3081,))
8787
])
8888

89-
if torch.distributed.get_rank() != 0:
89+
if dist.get_rank() != 0:
9090
# might be downloading mnist data, let rank 0 download first
91-
torch.distributed.barrier()
91+
dist.barrier()
9292

9393
dataset1 = datasets.MNIST('./data', train=True, download=True, transform=transform)
9494

95-
if torch.distributed.get_rank() == 0:
95+
if dist.get_rank() == 0:
9696
# mnist data is downloaded, indicate other ranks can proceed
97-
torch.distributed.barrier()
97+
dist.barrier()
9898

99-
dataset2 = datasets.MNIST('../data', train=False, transform=transform)
99+
dataset2 = datasets.MNIST('./data', train=False, transform=transform)
100100
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
101101

102102
net = Net().to(device)
@@ -120,4 +120,4 @@ def main():
120120

121121

122122
if __name__ == '__main__':
123-
print(f'[{torch.distributed.get_rank()}] Total time elapsed: {main()} seconds')
123+
print(f'[{dist.get_rank()}] Total time elapsed: {main()} seconds')

0 commit comments

Comments
 (0)