Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ jobs:
# platforms: linux/amd64
# runner: [self-hosted, linux/amd64, rocm]
# build_args: "NUM_MAKE_JOBS=16"
# ROCm 6.3 image built on a self-hosted linux/amd64 ROCm runner.
# build_args sets NUM_MAKE_JOBS=16, overriding the Dockerfile default (ARG NUM_MAKE_JOBS=64).
# NOTE(review): `dockerfile: rocm6.3.x` presumably resolves to dockerfile/rocm6.3.x.dockerfile — confirm against the build step.
- name: rocm6.3
dockerfile: rocm6.3.x
tags: superbench/main:rocm6.3
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
145 changes: 145 additions & 0 deletions dockerfile/rocm6.3.x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# The base image is parameterized: declared before FROM so it can be overridden
# at build time with --build-arg BASE_IMAGE=<image>.
ARG BASE_IMAGE=rocm/pytorch-training:v25.6

FROM ${BASE_IMAGE}

# Base image: rocm/pytorch-training:v25.6
# Pre-installed by base image:
# - Ubuntu: 22.04
# - Python: 3.10
# - ROCm: 6.3.4
# - openmpi: 4.0.7rc2
# - torch: 2.8.0a0+git7d205b2
# - rccl: 2.21.5.60304-76
# - hipblaslt: 0.15.0-8c69191d
# - transformer_engine: 1.14.0+2f85f5f2
# - flash_attention: 3.0.0.post1
# - cmake: 3.18.5
# - rocm-cmake: 0.14.0.60304-76
# - amd-smi: 25.1.0+8dc45db
# Added by this Dockerfile:
# - Docker Client: 27.5.1
# - mlc: v3.12
# - OFED: 24.10-1.1.4.0 LTS

# Fix base image botocore/urllib3 incompatibility:
# The base image ships botocore 1.22.12 (which expects urllib3 1.x) together with
# urllib3 2.6.3, causing
# "cannot import name 'DEFAULT_CIPHERS' from 'urllib3.util.ssl_'".
# Upgrade botocore/boto3 to releases that are compatible with urllib3 2.x.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir --upgrade botocore boto3
Comment thread
polarG marked this conversation as resolved.
Outdated
Comment thread
polarG marked this conversation as resolved.
Outdated

# Image metadata: record the maintainer as a label.
LABEL maintainer="SuperBench"

# Install OS packages (build tools, development libraries and command-line
# utilities) without interactive prompts, then empty /tmp in the same layer.
# One package per line, alphabetically sorted, to keep diffs readable.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update \
    && apt-get install -q -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        git \
        hipify-clang \
        iproute2 \
        jq \
        libaio-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libnuma-dev \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        numactl \
        openssh-client \
        openssh-server \
        pciutils \
        python3-mpi4py \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && rm -rf /tmp/*

# Build-time job count (default 64); the CI workflow overrides it through
# build_args (NUM_MAKE_JOBS=16).
# NOTE(review): no instruction visible in this Dockerfile references this ARG —
# confirm whether it is still needed or should be passed to the make invocations.
ARG NUM_MAKE_JOBS=64

Comment thread
polarG marked this conversation as resolved.
Outdated
# Install Docker: download the static x86_64 release pinned by DOCKER_VERSION,
# unpack its binaries directly into /usr/local/bin (dropping the archive's
# top-level directory) and delete the downloaded archive in the same layer.
ENV DOCKER_VERSION=27.5.1
RUN cd /tmp \
    && wget -q -O docker.tgz "https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz" \
    && tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ \
    && rm docker.tgz

# Update system config:
#  - create root's SSH directory (with an empty authorized_keys) and the sshd
#    runtime directory
#  - sshd: allow root login, allow user environment files, listen on port 22
#  - raise the open-file limits for all users ("*") and for root
# printf is used instead of echo "...\n...": whether echo expands "\n" depends
# on the shell (dash does, bash does not), so echo would write a literal "\n"
# into limits.conf if the RUN shell were ever changed.
RUN mkdir -p /root/.ssh /var/run/sshd && \
    touch /root/.ssh/authorized_keys && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' \
        >> /etc/security/limits.conf
Comment thread
polarG marked this conversation as resolved.


# Print the Ubuntu release of the base image to the build log (informational
# only; the value is not captured).
RUN echo "Ubuntu version: $(lsb_release -r -s)"
# Ubuntu release used to pick the MLNX_OFED archive name. This is a fixed
# default, not derived from the command above; override it with
# --build-arg UBUNTU_VERSION=<release> when the base image differs.
ARG UBUNTU_VERSION=22.04

# Install OFED
ENV OFED_VERSION=24.10-1.1.4.0
# Install the MLNX_OFED user-space stack only when the image does not already
# provide one; this is detected solely by the presence of the ofed_info command
# (its reported version is not inspected).
# The archive name is built from OFED_VERSION and UBUNTU_VERSION and fetched over
# HTTPS so the download cannot be altered in transit. PATH is prefixed with
# /usr/bin for the installer invocation only. The archive and the unpacked
# directory are removed in the same layer.
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
        echo "OFED not found. Installing OFED..."; \
        OFED_PKG="MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64" && \
        cd /tmp && \
        wget -q "https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${OFED_PKG}.tgz" && \
        tar xzf "${OFED_PKG}.tgz" && \
        PATH=/usr/bin:${PATH} "${OFED_PKG}/mlnxofedinstall" --user-space-only --without-fw-update --force --all && \
        rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
    fi
Comment thread
polarG marked this conversation as resolved.
Outdated

# ROCm / MPI build settings:
#   ROCM_PATH      - ROCm installation root
#   AMDGPU_TARGETS - target GPU architectures for ROCm builds (space-separated)
#   MPI_HOME       - OpenMPI pre-installed by the base image at /opt/ompi
ENV ROCM_PATH=/opt/rocm \
    AMDGPU_TARGETS="gfx908 gfx90a gfx942" \
    MPI_HOME=/opt/ompi

# Install Intel MLC: download the v3.12 archive, unpack only the Linux binary,
# copy it into /usr/local/bin and remove the download artefacts in the same layer.
RUN cd /tmp \
    && wget -q -O mlc.tgz https://downloadmirror.intel.com/866182/mlc_v3.12.tgz \
    && tar -xzf mlc.tgz Linux/mlc \
    && cp Linux/mlc /usr/local/bin/ \
    && rm -rf Linux mlc.tgz

# Runtime environment:
#  - PATH / LD_LIBRARY_PATH: prepend the OpenMPI, SuperBench, /usr/local and ROCm
#    locations to the values inherited from the base image
#  - SB_HOME: SuperBench root directory (used as WORKDIR below)
#  - SB_MICRO_PATH: prefix under which third_party/Makefile installs benchmark
#    binaries ($(SB_MICRO_PATH)/bin)
#  - ANSIBLE_*: Ansible settings; presumably silences deprecation warnings and
#    fixes the collections search path — confirm against the Ansible docs
ENV PATH="/opt/ompi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_LIBRARY_PATH="/opt/ompi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Rewrite /etc/environment with the PATH, LD_LIBRARY_PATH and SB_MICRO_PATH
# values configured above (any previous content of the file is replaced).
# NOTE(review): presumably so that SSH login sessions see the same values — confirm.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment

# Upgrade the Python packaging tools (setuptools pinned to 65.7), then verify
# that pkg_resources is importable and install setuptools as a fallback if not.
# The check and its fallback are grouped in braces: with a bare `A && B || C`
# the fallback would also run when the upgrade itself fails, and its success
# would hide that failure from the build.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel setuptools==65.7 && \
    { python3 -c "import pkg_resources" || python3 -m pip install --no-cache-dir setuptools; }
Comment thread
polarG marked this conversation as resolved.
Outdated

Comment thread
polarG marked this conversation as resolved.
Outdated
WORKDIR ${SB_HOME}

# Copy only third_party before the rest of the sources so this build layer is
# reused from cache when files outside third_party change. COPY replaces ADD:
# a plain local directory needs none of ADD's extra behaviours (archive
# extraction, URL fetching).
COPY third_party third_party

# Build the ROCm third-party benchmarks (`rocm` goal of third_party/Makefile).
#  - `-o <target>` tells make to treat that target as up to date, so cpu_hpl,
#    cpu_stream, megatron_lm and rocm_megatron_lm are not built here
#  - ROCM_VER is the HIP git tag cloned by the rocm_bandwidthTest target
# NOTE(review): ROCM_VER stays at rocm-5.5.0 although the image ships ROCm 6.3.4
# — confirm this is intentional.
RUN make RCCL_HOME=/opt/rocm ROCBLAS_BRANCH=release-staging/rocm-rel-6.3 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.3 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_megatron_lm
Comment thread
polarG marked this conversation as resolved.
Comment thread
polarG marked this conversation as resolved.

# Copy the remaining sources into the working directory. COPY replaces ADD
# because only plain local files are transferred.
COPY . .
# NOTE(review): these two switches are presumably read by the build/runtime of
# the HIP micro-benchmarks — confirm where they are consumed.
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
# Install the package with its "amdworker" extras, build the C++ benchmarks with
# hipcc as the C++ compiler, then run the post-install step.
# '.[amdworker]' is quoted because an unquoted [...] is a shell glob pattern and
# would be rewritten if a matching dot-file existed in the working directory.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN python3 -m pip install --no-cache-dir --no-build-isolation '.[amdworker]' && \
    CXX=/opt/rocm/bin/hipcc make cppbuild && \
    make postinstall
Comment thread
polarG marked this conversation as resolved.
Outdated
Comment thread
polarG marked this conversation as resolved.
Outdated
Comment thread
polarG marked this conversation as resolved.
Outdated
9 changes: 9 additions & 0 deletions third_party/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,17 @@ ifneq (,$(wildcard fio/Makefile))
endif

# Build rccl-tests from commit 46375b1 of default branch.
# If AMDGPU_TARGETS is set to a non-empty, space-separated list
# (e.g. "gfx908 gfx90a gfx942"), the build receives an explicit HIPCUFLAGS with
# one --offload-arch flag per architecture plus the ROCm and MPI include paths.
# Otherwise, the original build command is used (relies on hipcc auto-detection).
# The build only runs when the rccl-tests checkout contains a Makefile.
# ROCM_OFFLOAD_ARCH_FLAGS expands every entry of AMDGPU_TARGETS to --offload-arch=<arch>.
ROCM_OFFLOAD_ARCH_FLAGS := $(foreach arch,$(AMDGPU_TARGETS),--offload-arch=$(arch))
rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/Makefile))
ifdef AMDGPU_TARGETS
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIPCUFLAGS="-std=c++14 -O3 $(ROCM_OFFLOAD_ARCH_FLAGS) -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/rccl -I$(ROCM_PATH)/include/hip -DMPI_SUPPORT -I$(MPI_HOME)/include -I$(MPI_HOME)/include/mpi" -j
else
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
endif
Comment thread
polarG marked this conversation as resolved.
Outdated
cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif

Expand Down Expand Up @@ -168,6 +176,7 @@ rocm_hipblaslt: sb_micro_path
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.

rocm_bandwidthTest: sb_micro_path
git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git
cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
Expand Down
Loading