File tree Expand file tree Collapse file tree
Container-Root/eks/deployment Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -13,13 +13,14 @@ spec:
1313 imagePullPolicy : IfNotPresent
1414 restartPolicy : OnFailure
1515 containers :
16+ # - image: <account>.dkr.ecr.<region>.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
1617 - image : public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
1718 name : efa-info-launcher
1819 env :
1920 - name : LD_LIBRARY_PATH
2021 value : /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
2122 - name : PATH
22- value : $PATH:/opt/amazon/efa/bin
23+ value : $PATH:/opt/amazon/efa/bin:/usr/bin
2324 - name : XLA_FLAGS
2425 value : " --xla_gpu_cuda_data_dir=/usr/local/cuda"
2526 - name : TF_XLA_FLAGS
3132 - --allow-run-as-root
3233 - --tag-output
3334 - -np
34- - " 16 "
35+ - " 2 "
3536 - -bind-to
3637 - none
3738 - -map-by
@@ -49,14 +50,16 @@ spec:
4950 - -x
5051 - NCCL_ALGO=RING
5152 - -x
52- - FI_EFA_USE_DEVICE_RDMA=1
53+ - FI_EFA_USE_DEVICE_RDMA=0
5354 - -x
54- - RDMAV_FORK_SAFE=1
55+ - RDMAV_FORK_SAFE=0
5556 - -x
5657 - NCCL_DEBUG
5758 - --mca
5859 - pml
5960 - ^cm
61+ - --mca
62+ - pml_rsh_agent=ssh
6063 - --oversubscribe
6164 - /opt/amazon/efa/bin/fi_info
6265 - -p
@@ -69,16 +72,17 @@ spec:
6972 spec :
7073 imagePullPolicy : IfNotPresent
7174 containers :
75+ # - image: <account>.dkr.ecr.<region>.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
7276 - image : public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
7377 name : efa-info-worker
7478 resources :
7579 limits :
76- nvidia.com/gpu : 8
77- hugepages-2Mi : 5120Mi
78- vpc.amazonaws.com/efa : 4
79- memory : 8000Mi
80+ nvidia.com/gpu : 1
81+ # hugepages-2Mi: 5120Mi
82+ vpc.amazonaws.com/efa : 1
83+ # memory: 8000Mi
8084 requests :
81- nvidia.com/gpu : 8
82- hugepages-2Mi : 5120Mi
83- vpc.amazonaws.com/efa : 4
84- memory : 8000Mi
85+ nvidia.com/gpu : 1
86+ # hugepages-2Mi: 5120Mi
87+ vpc.amazonaws.com/efa : 1
88+ # memory: 8000Mi
---
# MPIJob (kubeflow.org/v2beta1) that launches the NCCL all_reduce_perf
# benchmark with mpirun across two single-GPU, single-EFA workers.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: test-nccl-efa
spec:
  runPolicy:
    cleanPodPolicy: Running
    backoffLimit: 20
  # One MPI slot per worker pod; with 2 worker replicas this matches "-np 2".
  slotsPerWorker: 1
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            # - image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
              name: test-nccl-efa-launcher
              # imagePullPolicy is a per-container field; it is not part of
              # the pod spec, so it is declared here on the container.
              imagePullPolicy: IfNotPresent
              command:
                - /opt/amazon/openmpi/bin/mpirun
                - --allow-run-as-root
                - --tag-output
                # Total rank count: Worker replicas (2) x slotsPerWorker (1).
                - -np
                - "2"
                - -bind-to
                - none
                - -map-by
                - slot
                # Each "-x" exports one environment variable to the ranks.
                - -x
                - LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - NCCL_ALGO=Ring
                - -x
                - FI_PROVIDER=efa
                - -x
                - FI_EFA_USE_DEVICE_RDMA=1
                - -x
                - RDMAV_FORK_SAFE=1
                - -x
                - NCCL_SHM_DISABLE=0
                # "^cm" excludes the cm PML component from selection.
                - --mca
                - pml
                - ^cm
                - --oversubscribe
                # Benchmark binary followed by its arguments; numeric values
                # are quoted because container args must be strings.
                - /opt/nccl-tests/build/all_reduce_perf
                - -b
                - "1"
                - -e
                - 1G
                - -f
                - "2"
                - -t
                - "1"
                - -g
                - "1"
                - -c
                - "1"
                - -n
                - "100"
    Worker:
      replicas: 2
      template:
        spec:
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "g4dn.metal"
          containers:
            # - image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
            - image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
              name: test-nccl-efa-worker
              # imagePullPolicy is a per-container field; it is not part of
              # the pod spec, so it is declared here on the container.
              imagePullPolicy: IfNotPresent
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
              resources:
                limits:
                  nvidia.com/gpu: 1
                  # hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 1
                  # memory: 8000Mi
                requests:
                  nvidia.com/gpu: 1
                  # hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 1
                  # memory: 8000Mi
          volumes:
            # The host's /dev/shm is mounted into the worker at /dev/shm.
            - name: shmem
              hostPath:
                path: /dev/shm
Original file line number Diff line number Diff line change 1818 # - image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
1919 - image : public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
2020 name : test-nccl-efa-launcher
21- # env:
22- # - name: LD_LIBRARY_PATH
23- # value: /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
24- # value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
25- # - name: PATH
26- # value: /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
27- # value: $PATH:/opt/amazon/efa/bin
2821 command :
2922 - /opt/amazon/openmpi/bin/mpirun
3023 - --allow-run-as-root
@@ -36,19 +29,17 @@ spec:
3629 - -map-by
3730 - slot
3831 - -x
39- - PATH
40- - -x
41- - LD_LIBRARY_PATH
32+ - LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
4233 - -x
4334 - NCCL_DEBUG=INFO
4435 - -x
4536 - NCCL_ALGO=Ring
4637 - -x
4738 - FI_PROVIDER=efa
4839 - -x
49- - FI_EFA_USE_DEVICE_RDMA=1
40+ - FI_EFA_USE_DEVICE_RDMA=0
5041 - -x
51- - RDMAV_FORK_SAFE=1
42+ - RDMAV_FORK_SAFE=0
5243 - -x
5344 - NCCL_SHM_DISABLE=0
5445 - --mca
7566 template :
7667 spec :
7768 # nodeSelector:
78- # beta.kubernetes.io/instance-type: "p4d.24xlarge"
79- # beta.kubernetes.io/instance-type: "p3dn.24xlarge"
80- # beta.kubernetes.io/instance-type: "g4dn.metal"
69+ # node.kubernetes.io/instance-type: "g4dn.metal"
8170 imagePullPolicy : IfNotPresent
8271 containers :
8372 # - image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
@@ -91,12 +80,12 @@ spec:
9180 nvidia.com/gpu : 1
9281 # hugepages-2Mi: 5120Mi
9382 vpc.amazonaws.com/efa : 1
94- memory : 8000Mi
83+ # memory: 8000Mi
9584 requests :
9685 nvidia.com/gpu : 1
9786 # hugepages-2Mi: 5120Mi
9887 vpc.amazonaws.com/efa : 1
99- memory : 8000Mi
88+ # memory: 8000Mi
10089 volumes :
10190 - name : shmem
10291 hostPath :
---
# ClusterRole for the mpi-operator: the cluster-wide permissions listed
# below, one rule per API group / resource set.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations: {}
  labels:
    app: mpi-operator
    app.kubernetes.io/component: mpijob
    app.kubernetes.io/name: mpi-operator
    kustomize.component: mpi-operator
  name: mpi-operator
rules:
  # The empty string "" selects the core API group (pods, services, ...).
  - apiGroups:
      - ""
    resources:
      - configmaps
      - secrets
      - services
    verbs:
      - create
      - list
      - watch
      - update
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - create
      - get
      - list
      - watch
      - delete
      - update
      - patch
  - apiGroups:
      - ""
    resources:
      - pods/exec
    verbs:
      - create
  - apiGroups:
      - ""
    resources:
      - endpoints
    verbs:
      - create
      - get
      - update
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - apps
    resources:
      - statefulsets
    verbs:
      - create
      - list
      - update
      - watch
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - create
      - list
      - update
      - watch
  - apiGroups:
      - apiextensions.k8s.io
    resources:
      - customresourcedefinitions
    verbs:
      - create
      - get
  # '*' grants every verb on the MPIJob resources and their subresources.
  - apiGroups:
      - kubeflow.org
    resources:
      - mpijobs
      - mpijobs/finalizers
      - mpijobs/status
    verbs:
      - '*'
  - apiGroups:
      - scheduling.incubator.k8s.io
      - scheduling.sigs.dev
    resources:
      - queues
      - podgroups
    verbs:
      - '*'
  - apiGroups:
      - coordination.k8s.io
    resources:
      - leases
    verbs:
      - get
      - create
      - delete
      - update
You can’t perform that action at this time.
0 commit comments