Skip to content

Commit dff06fa

Browse files
authored
Merge pull request #11 from aws-samples/deepspeed
Deepspeed and distributed training examples
2 parents 124861e + b1fd3e5 commit dff06fa

74 files changed

Lines changed: 1157 additions & 85 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.env

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,40 @@ export http_proxy=
1313
export https_proxy=
1414
export no_proxy=localhost
1515

16+
# AWS settings
17+
## AWS_PROFILE - name of AWS settings profile AWS_PROFILE=default(default)|aws-do-eks|...
18+
export AWS_PROFILE=default
19+
## AWS_REGION - will be set to AWS_DEFAULT_REGION if not set externally.
20+
export AWS_DEFAULT_REGION=us-east-1
21+
if [ "${AWS_REGION}" == "" ]; then
22+
export AWS_REGION=$AWS_DEFAULT_REGION
23+
fi
1624
# Docker image settings
1725
## REGISTRY: [optional] - Docker registry path including trailing "/". Example: registry.company.com/demo/
18-
export REGISTRY=
26+
## If REGISTRY==default, then the default elastic container registry in the account for the current region will be used
27+
export REGISTRY=default
28+
## Set default registry if needed
29+
if [ "$REGISTRY" == "default" ]; then
30+
export REGION=${AWS_REGION}
31+
export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
32+
if [ "$ACCOUNT" == "" ]; then
33+
export REGISTRY=""
34+
else
35+
export REGISTRY=${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/
36+
fi
37+
fi
38+
## Add trailing forward slash if needed
1939
if [ -n "${REGISTRY}" ]; then
2040
if [ "${REGISTRY: -1}" != "/" ]; then
2141
export REGISTRY="${REGISTRY}/"
2242
fi
2343
fi
44+
2445
## IMAGE: <required> - Docker image name for this project. Example: myapp
2546
export IMAGE=aws-do-eks
2647
## VERSION: [optional] - Version tag for this Docker image. Example: v20180302
2748
#export VERSION=v$(date +%Y%m%d)
28-
export VERSION=v3-20220707
49+
export VERSION=v4-20220801
2950
# TAG is ":<VERSION>" when VERSION is non-empty, otherwise the empty string.
export TAG="${VERSION:+:${VERSION}}"
3051
## BUILD_OPTS: [optional] - arguments for the docker image build command
3152
export BUILD_OPTS="--progress plain --build-arg http_proxy=${http_proxy} --build-arg https_proxy=${https_proxy} --build-arg no_proxy=${no_proxy}"
@@ -37,9 +58,9 @@ export CONTAINER_NAME="--name ${CONTAINER}"
3758
## Port map [optional] - Mapping of external to internal ports including the -p switch. Example -p 80:8080
3859
#export PORT_MAP="-p 80:8080"
3960
## Volume map [optional] - Mapping of external to internal paths including the -v switch. Example -v $(pwd):/wd
40-
export VOL_MAP="-v ${HOME}/.aws:/root/.aws -v ${HOME}/.kube:/root/.kube -v $(pwd)/wd/conf/eks.conf:/eks/eks.conf -v $(pwd)/wd/conf/eks.yaml:/eks/eks.yaml -v $(pwd):/aws-do-eks"
61+
export VOL_MAP="-v ${HOME}/.aws:/root/.aws -v ${HOME}/.kube:/root/.kube -v $(pwd)/wd/conf/eks.conf:/eks/eks.conf -v $(pwd)/wd/conf/eks.yaml:/eks/eks.yaml -v $(pwd):/aws-do-eks -v /var/run/docker.sock:/var/run/docker.sock"
4162
## Network [optional] - Network name including the --net switch. Example --net mynet
4263
#export NETWORK=
4364
## RUN_OPTS [optional] - additional options to specify with the run command. Example -e POSTGRES_DB=dbname
44-
export RUN_OPTS="-e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy"
65+
export RUN_OPTS="-e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e AWS_PROFILE=$AWS_PROFILE -e REGION=$REGION"
4566

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,6 @@ Container-Root/eks/deployment/cluster-autoscaler/cluster-autoscaler.yaml
22
Container-Root/eks/deployment/aws-load-balancer-controller/aws-load-balancer-controller.yaml
33
Container-Root/eks/deployment/kube-ops-view/kube-ops-view
44
.DS_Store
5+
Container-Root/eks/deployment/distributed-training/pytorch/habana/deepspeed-bert/efs-get-data.yaml
6+
Container-Root/eks/deployment/distributed-training/pytorch/habana/deepspeed-bert/deepspeed-bert.yaml
7+
Container-Root/eks/deployment/distributed-training/tensorflow/habana/mpijob-mnist.yaml
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
---
2+
apiVersion: networking.k8s.io/v1
3+
kind: Ingress
4+
metadata:
5+
namespace: ${NAMESPACE}
6+
name: ${INGRESS_NAME}
7+
annotations:
8+
alb.ingress.kubernetes.io/scheme: internet-facing
9+
alb.ingress.kubernetes.io/target-type: ip
10+
spec:
11+
ingressClassName: alb
12+
rules:
13+
- path: /
14+
pathType: Prefix
15+
backend:
16+
service:
17+
name: ${SERVICE_NAME}
18+
port:
19+
number: ${SERVICE_PORT}

Container-Root/eks/deployment/cluster-autoscaler/set-iam-role.sh

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,27 @@ source ../../eks.conf
66
IAM_POLICY=$(aws iam list-policies --no-paginate | grep PolicyName | grep AmazonEKSClusterAutoscalerPolicy)
77

88
if [ "$IAM_POLICY" == "" ]; then
9-
echo "Creating Cluster Autoscaler IAM Policy ..."
10-
json_out=$(aws iam create-policy --policy-name AmazonEKSClusterAutoscalerPolicy --policy-document file://cluster-autoscaler-policy.json)
11-
arn=$(echo $json_out | jq '.Policy.Arn')
12-
POLICY_ARN=$(echo $arn | sed -e 's/\"//g')
13-
echo "POLICY_ARN=$POLICY_ARN"
9+
echo "Creating Cluster Autoscaler IAM Policy ..."
10+
json_out=$(aws iam create-policy --policy-name AmazonEKSClusterAutoscalerPolicy --policy-document file://cluster-autoscaler-policy.json)
11+
arn=$(echo $json_out | jq '.Policy.Arn')
12+
POLICY_ARN=$(echo $arn | sed -e 's/\"//g')
13+
echo "POLICY_ARN=$POLICY_ARN"
1414
else
15-
echo "IAM Policy $IAM_POLICY alreqady exists"
15+
echo "IAM Policy $IAM_POLICY already exists"
16+
# Look up the existing policy's ARN by exact PolicyName via a JMESPath query, so the result does not depend on the CLI's default output format or on substring matches against similarly named policies.
POLICY_ARN=$(aws iam list-policies --no-paginate --query "Policies[?PolicyName=='AmazonEKSClusterAutoscalerPolicy'].Arn" --output text)
1617
fi
1718

1819
# Create service account cluster-autoscaler with attached policy if it does not exist
1920
output=$(eksctl get iamserviceaccount --cluster $CLUSTER_NAME --output json)
2021
clean_out=$(echo ${output##*[} )
2122
json_out="[ $clean_out"
2223
IAM_SA_NAMES=$(echo $json_out | jq -r '.[].metadata.name')
23-
IAM_SA_NAME=$(echo $IAM_SA_NAMES | grep cluster-autoscaler)
24+
IAM_SA_NAME=$(echo $IAM_SA_NAMES | grep cluster-autoscaler)
2425

2526
if [ "$IAM_SA_NAME" == "" ]; then
26-
echo "Creating IAM Service Account cluster-autoscaler ..."
27-
eksctl create iamserviceaccount --cluster=$CLUSTER_NAME --namespace=kube-system --name=cluster-autoscaler \
28-
--attach-policy-arn=$POLICY_ARN --override-existing-serviceaccounts --approve
27+
echo "Creating IAM Service Account cluster-autoscaler ..."
28+
eksctl create iamserviceaccount --cluster=$CLUSTER_NAME --namespace=kube-system --name=cluster-autoscaler \
29+
--attach-policy-arn=$POLICY_ARN --override-existing-serviceaccounts --approve
2930
else
30-
echo "IAM Service Account $IAM_SA_NAME already exists"
31+
echo "IAM Service Account $IAM_SA_NAME already exists"
3132
fi
32-
33-

Container-Root/eks/deployment/distributed-training/pytorch/README.md

Lines changed: 0 additions & 7 deletions
This file was deleted.
Lines changed: 8 additions & 0 deletions

Container-Root/eks/deployment/distributed-training/pytorch/config/crd/bases/elastic.pytorch.org_elasticjobs.yaml renamed to Container-Root/eks/deployment/distributed-training/pytorch/elasticjob/config/crd/bases/elastic.pytorch.org_elasticjobs.yaml

File renamed without changes.

Container-Root/eks/deployment/distributed-training/pytorch/config/crd/kustomization.yaml renamed to Container-Root/eks/deployment/distributed-training/pytorch/elasticjob/config/crd/kustomization.yaml

File renamed without changes.

Container-Root/eks/deployment/distributed-training/pytorch/config/crd/kustomizeconfig.yaml renamed to Container-Root/eks/deployment/distributed-training/pytorch/elasticjob/config/crd/kustomizeconfig.yaml

File renamed without changes.

Container-Root/eks/deployment/distributed-training/pytorch/config/default/kustomization.yaml renamed to Container-Root/eks/deployment/distributed-training/pytorch/elasticjob/config/default/kustomization.yaml

File renamed without changes.

0 commit comments

Comments
 (0)