# Source: dstack/examples/inference/sglang/pd.dstack.yml
# (Bihan/dstack @ 34baf13d43df3346a005278c6028116262e7cc81)
# dstack service that serves MODEL_ID with SGLang, split into router,
# prefill, and decode replica groups (see `replicas`).
type: service
name: prefill-decode
# NOTE(review): `latest` is a floating tag; consider pinning a specific
# version for reproducible deployments.
image: lmsysorg/sglang:latest

env:
  # Declared without a value here; presumably taken from the caller's
  # environment -- confirm.
  - HF_TOKEN
  # Model identifier read by the launch commands via $MODEL_ID.
  - MODEL_ID=zai-org/GLM-4.5-Air-FP8

# Replica groups: one router group, then the prefill and decode worker
# groups, each with its own count range and rps scaling target.
replicas:
  # Router group. For now, a replica group with a router must have count: 1.
  - count: 1
    resources:
      cpu: 4
    router:
      type: sglang
    commands:
      - pip install sglang_router
      # Start the SGLang router with the --pd-disaggregation flag on port 8000.
      - |
        python -m sglang_router.launch_router \
          --host 0.0.0.0 \
          --port 8000 \
          --pd-disaggregation \
          --prefill-policy cache_aware

  # Prefill workers: 1 to 4 replicas, scaled on the rps metric (target 3).
  - count: 1..4
    scaling:
      metric: rps
      target: 3
    resources:
      gpu: H200
    commands:
      # Launch an SGLang server in prefill mode with the nixl transfer backend.
      - |
        python -m sglang.launch_server \
          --model-path $MODEL_ID \
          --disaggregation-mode prefill \
          --disaggregation-transfer-backend nixl \
          --host 0.0.0.0 \
          --port 8000 \
          --disaggregation-bootstrap-port 8998

  # Decode workers: 1 to 8 replicas, scaled on the rps metric (target 2).
  - count: 1..8
    scaling:
      metric: rps
      target: 2
    resources:
      gpu: H200
    commands:
      # Launch an SGLang server in decode mode with the nixl transfer backend.
      - |
        python -m sglang.launch_server \
          --model-path $MODEL_ID \
          --disaggregation-mode decode \
          --disaggregation-transfer-backend nixl \
          --host 0.0.0.0 \
          --port 8000

# Service port; matches the --port 8000 passed to every launch command.
port: 8000
# Same model identifier as MODEL_ID in `env`.
model: zai-org/GLM-4.5-Air-FP8

# HTTP probe against /health at a 15s interval.
probes:
  - type: http
    url: /health
    interval: 15s