aws-do-ray/Container-Root/ray/raycluster/raycluster-template.yaml at main · aws-samples/aws-do-ray · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  name: rayml
  labels:
    controller-tools.k8s.io: "1.0"
spec:
  # Ray head pod template
  headGroupSpec:
    # The `rayStartParams` are used to configure the `ray start` command.
    # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
    # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
    rayStartParams:
      dashboard-host: '0.0.0.0'
    #pod template
    template:
      spec:
        #nodeSelector:
          #node.kubernetes.io/instance-type: "ml.m5.2xlarge"
        securityContext:
          runAsUser: 0
          runAsGroup: 0
          fsGroup: 0
        containers:
        - name: ray-head
          image: ${REGISTRY}aws-ray-custom:latest     ## IMAGE: Here you may choose which image your head pod will run
          env:                                ## ENV: Here is where you can send stuff to the head pod
            - name: RAY_GRAFANA_IFRAME_HOST   ## PROMETHEUS AND GRAFANA
              value: http://localhost:3000
            - name: RAY_GRAFANA_HOST
              value: http://prometheus-grafana.prometheus-system.svc:80
            - name: RAY_PROMETHEUS_HOST
              value: http://prometheus-kube-prometheus-prometheus.prometheus-system.svc:9090
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          resources:
            limits:                                    ## LIMITS: Set resource limits for your head pod
              cpu: 1
              memory: 8Gi
            requests:                                    ## REQUESTS: Set resource requests for your head pod
              cpu: 1
              memory: 8Gi
          ports:
          - containerPort: 6379
            name: gcs-server
          - containerPort: 8265 # Ray dashboard
            name: dashboard
          - containerPort: 10001
            name: client
          - containerPort: 8000
            name: serve
          volumeMounts:
          - name: fsx-storage
            mountPath: /fsx
          - name: ray-logs
            mountPath: /tmp/ray
        volumes:
          - name: ray-logs
            emptyDir: {}
          - name: fsx-storage
            persistentVolumeClaim:
              claimName: fsx-claim
  workerGroupSpecs:
  # the pod replicas in this group typed worker
  - replicas: 4                                    ## REPLICAS: How many worker pods you want
    minReplicas: 1
    maxReplicas: 10
    # logical group name, for this called small-group, also can be functional
    groupName: gpu-group
    rayStartParams:
      num-gpus: "1"
    #pod template
    template:
      spec:
        #nodeSelector:
        # node.kubernetes.io/instance-type: "ml.g5.8xlarge"
        # sagemaker.amazonaws.com/node-health-status: Schedulable
        securityContext:
          runAsUser: 0
          runAsGroup: 0
          fsGroup: 0
        containers:
        - name: ray-worker
          image: ${REGISTRY}aws-ray-custom:latest             ## IMAGE: Here you may choose which image your head node will run
          env:
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          resources:
            limits:                                    ## LIMITS: Set resource limits for your worker pods
              cpu: 4
              memory: 24Gi
              nvidia.com/gpu: 1
            requests:                                    ## REQUESTS: Set resource requests for your worker pods
              cpu: 4
              memory: 24Gi
              nvidia.com/gpu: 1
          volumeMounts:                                    ## VOLUMEMOUNTS: Mount your S3 CSI EKS Add-On to worker pods
          - name: ray-logs
            mountPath: /tmp/ray
          - name: fsx-storage
            mountPath: /fsx
        volumes:
        - name: fsx-storage
          persistentVolumeClaim:
            claimName: fsx-claim
        - name: ray-logs
          emptyDir: {}
---
# UNCOMMENT BELOW IF YOU HAVE DEPLOYED PROMETHEUS

#apiVersion: monitoring.coreos.com/v1
#kind: PodMonitor
#metadata:
#  name: ray-workers-monitor
#  namespace: prometheus-system
#  labels:
#    release: prometheus
#    ray.io/cluster: rayml # $RAY_CLUSTER_NAME: "kubectl get rayclusters.ray.io"
#spec:
#  jobLabel: ray-workers
#  namespaceSelector:
#    matchNames:
#      - default
#  selector:
#    matchLabels:
#      ray.io/node-type: worker
#  podMetricsEndpoints:
#  - port: metrics