Skip to content

Commit 24dc455

Browse files
committed
Add leases permission to mpi-operator role, update EFA tests
1 parent a5eac23 commit 24dc455

4 files changed

Lines changed: 219 additions & 29 deletions

File tree

Container-Root/eks/deployment/efa-device-plugin/test-efa.yaml

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,14 @@ spec:
1313
imagePullPolicy: IfNotPresent
1414
restartPolicy: OnFailure
1515
containers:
16+
#- image: <account>.dkr.ecr.<region>.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
1617
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
1718
name: efa-info-launcher
1819
env:
1920
- name: LD_LIBRARY_PATH
2021
value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
2122
- name: PATH
22-
value: $PATH:/opt/amazon/efa/bin
23+
value: $PATH:/opt/amazon/efa/bin:/usr/bin
2324
- name: XLA_FLAGS
2425
value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
2526
- name: TF_XLA_FLAGS
@@ -31,7 +32,7 @@ spec:
3132
- --allow-run-as-root
3233
- --tag-output
3334
- -np
34-
- "16"
35+
- "2"
3536
- -bind-to
3637
- none
3738
- -map-by
@@ -49,14 +50,16 @@ spec:
4950
- -x
5051
- NCCL_ALGO=RING
5152
- -x
52-
- FI_EFA_USE_DEVICE_RDMA=1
53+
- FI_EFA_USE_DEVICE_RDMA=0
5354
- -x
54-
- RDMAV_FORK_SAFE=1
55+
- RDMAV_FORK_SAFE=0
5556
- -x
5657
- NCCL_DEBUG
5758
- --mca
5859
- pml
5960
- ^cm
61+
- --mca
62+
- pml_rsh_agent=ssh
6063
- --oversubscribe
6164
- /opt/amazon/efa/bin/fi_info
6265
- -p
@@ -69,16 +72,17 @@ spec:
6972
spec:
7073
imagePullPolicy: IfNotPresent
7174
containers:
75+
#- image: <account>.dkr.ecr.<region>.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
7276
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
7377
name: efa-info-worker
7478
resources:
7579
limits:
76-
nvidia.com/gpu: 8
77-
hugepages-2Mi: 5120Mi
78-
vpc.amazonaws.com/efa: 4
79-
memory: 8000Mi
80+
nvidia.com/gpu: 1
81+
#hugepages-2Mi: 5120Mi
82+
vpc.amazonaws.com/efa: 1
83+
#memory: 8000Mi
8084
requests:
81-
nvidia.com/gpu: 8
82-
hugepages-2Mi: 5120Mi
83-
vpc.amazonaws.com/efa: 4
84-
memory: 8000Mi
85+
nvidia.com/gpu: 1
86+
#hugepages-2Mi: 5120Mi
87+
vpc.amazonaws.com/efa: 1
88+
#memory: 8000Mi
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
apiVersion: kubeflow.org/v2beta1
2+
kind: MPIJob
3+
metadata:
4+
name: test-nccl-efa
5+
spec:
6+
runPolicy:
7+
cleanPodPolicy: Running
8+
backoffLimit: 20
9+
slotsPerWorker: 1
10+
mpiReplicaSpecs:
11+
Launcher:
12+
replicas: 1
13+
template:
14+
spec:
15+
imagePullPolicy: IfNotPresent
16+
restartPolicy: OnFailure
17+
containers:
18+
#- image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
19+
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
20+
name: test-nccl-efa-launcher
21+
command:
22+
- /opt/amazon/openmpi/bin/mpirun
23+
- --allow-run-as-root
24+
- --tag-output
25+
- -np
26+
- "2"
27+
- -bind-to
28+
- none
29+
- -map-by
30+
- slot
31+
- -x
32+
- LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
33+
- -x
34+
- NCCL_DEBUG=INFO
35+
- -x
36+
- NCCL_ALGO=Ring
37+
- -x
38+
- FI_PROVIDER=efa
39+
- -x
40+
- FI_EFA_USE_DEVICE_RDMA=1
41+
- -x
42+
- RDMAV_FORK_SAFE=1
43+
- -x
44+
- NCCL_SHM_DISABLE=0
45+
- --mca
46+
- pml
47+
- ^cm
48+
- --oversubscribe
49+
- /opt/nccl-tests/build/all_reduce_perf
50+
- -b
51+
- "1"
52+
- -e
53+
- 1G
54+
- -f
55+
- "2"
56+
- -t
57+
- "1"
58+
- -g
59+
- "1"
60+
- -c
61+
- "1"
62+
- -n
63+
- "100"
64+
Worker:
65+
replicas: 2
66+
template:
67+
spec:
68+
#nodeSelector:
69+
#node.kubernetes.io/instance-type: "g4dn.metal"
70+
imagePullPolicy: IfNotPresent
71+
containers:
72+
#- image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
73+
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
74+
name: test-nccl-efa-worker
75+
volumeMounts:
76+
- name: shmem
77+
mountPath: /dev/shm
78+
resources:
79+
limits:
80+
nvidia.com/gpu: 1
81+
#hugepages-2Mi: 5120Mi
82+
vpc.amazonaws.com/efa: 1
83+
#memory: 8000Mi
84+
requests:
85+
nvidia.com/gpu: 1
86+
#hugepages-2Mi: 5120Mi
87+
vpc.amazonaws.com/efa: 1
88+
#memory: 8000Mi
89+
volumes:
90+
- name: shmem
91+
hostPath:
92+
path: /dev/shm

Container-Root/eks/deployment/efa-device-plugin/test-nccl-efa.yaml

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,6 @@ spec:
1818
#- image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
1919
- image: public.ecr.aws/w6p6i9i7/aws-efa-nccl-rdma:base-cudnn8-cuda11-ubuntu18.04
2020
name: test-nccl-efa-launcher
21-
#env:
22-
#- name: LD_LIBRARY_PATH
23-
# value: /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
24-
#value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
25-
#- name: PATH
26-
# value: /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
27-
# value: $PATH:/opt/amazon/efa/bin
2821
command:
2922
- /opt/amazon/openmpi/bin/mpirun
3023
- --allow-run-as-root
@@ -36,19 +29,17 @@ spec:
3629
- -map-by
3730
- slot
3831
- -x
39-
- PATH
40-
- -x
41-
- LD_LIBRARY_PATH
32+
- LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
4233
- -x
4334
- NCCL_DEBUG=INFO
4435
- -x
4536
- NCCL_ALGO=Ring
4637
- -x
4738
- FI_PROVIDER=efa
4839
- -x
49-
- FI_EFA_USE_DEVICE_RDMA=1
40+
- FI_EFA_USE_DEVICE_RDMA=0
5041
- -x
51-
- RDMAV_FORK_SAFE=1
42+
- RDMAV_FORK_SAFE=0
5243
- -x
5344
- NCCL_SHM_DISABLE=0
5445
- --mca
@@ -75,9 +66,7 @@ spec:
7566
template:
7667
spec:
7768
#nodeSelector:
78-
#beta.kubernetes.io/instance-type: "p4d.24xlarge"
79-
#beta.kubernetes.io/instance-type: "p3dn.24xlarge"
80-
#beta.kubernetes.io/instance-type: "g4dn.metal"
69+
#node.kubernetes.io/instance-type: "g4dn.metal"
8170
imagePullPolicy: IfNotPresent
8271
containers:
8372
#- image: <account>.dkr.ecr.us-west-2.amazonaws.com/cuda-efa-nccl-tests:ubuntu18.04
@@ -91,12 +80,12 @@ spec:
9180
nvidia.com/gpu: 1
9281
#hugepages-2Mi: 5120Mi
9382
vpc.amazonaws.com/efa: 1
94-
memory: 8000Mi
83+
#memory: 8000Mi
9584
requests:
9685
nvidia.com/gpu: 1
9786
#hugepages-2Mi: 5120Mi
9887
vpc.amazonaws.com/efa: 1
99-
memory: 8000Mi
88+
#memory: 8000Mi
10089
volumes:
10190
- name: shmem
10291
hostPath:
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRole
3+
metadata:
4+
annotations:
5+
labels:
6+
app: mpi-operator
7+
app.kubernetes.io/component: mpijob
8+
app.kubernetes.io/name: mpi-operator
9+
kustomize.component: mpi-operator
10+
name: mpi-operator
11+
rules:
12+
- apiGroups:
13+
- ""
14+
resources:
15+
- configmaps
16+
- secrets
17+
- services
18+
verbs:
19+
- create
20+
- list
21+
- watch
22+
- update
23+
- apiGroups:
24+
- ""
25+
resources:
26+
- pods
27+
verbs:
28+
- create
29+
- get
30+
- list
31+
- watch
32+
- delete
33+
- update
34+
- patch
35+
- apiGroups:
36+
- ""
37+
resources:
38+
- pods/exec
39+
verbs:
40+
- create
41+
- apiGroups:
42+
- ""
43+
resources:
44+
- endpoints
45+
verbs:
46+
- create
47+
- get
48+
- update
49+
- apiGroups:
50+
- ""
51+
resources:
52+
- events
53+
verbs:
54+
- create
55+
- patch
56+
- apiGroups:
57+
- apps
58+
resources:
59+
- statefulsets
60+
verbs:
61+
- create
62+
- list
63+
- update
64+
- watch
65+
- apiGroups:
66+
- batch
67+
resources:
68+
- jobs
69+
verbs:
70+
- create
71+
- list
72+
- update
73+
- watch
74+
- apiGroups:
75+
- apiextensions.k8s.io
76+
resources:
77+
- customresourcedefinitions
78+
verbs:
79+
- create
80+
- get
81+
- apiGroups:
82+
- kubeflow.org
83+
resources:
84+
- mpijobs
85+
- mpijobs/finalizers
86+
- mpijobs/status
87+
verbs:
88+
- '*'
89+
- apiGroups:
90+
- scheduling.incubator.k8s.io
91+
- scheduling.sigs.dev
92+
resources:
93+
- queues
94+
- podgroups
95+
verbs:
96+
- '*'
97+
- apiGroups:
98+
- coordination.k8s.io
99+
resources:
100+
- leases
101+
verbs:
102+
- get
103+
- create
104+
- delete
105+
- update

0 commit comments

Comments
 (0)