forked from dstackai/dstack
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpd.dstack.yml
More file actions
62 lines (56 loc) · 1.32 KB
/
pd.dstack.yml
File metadata and controls
62 lines (56 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# dstack service that runs SGLang with prefill/decode (PD) disaggregation:
# one router replica group plus separately scaled prefill and decode groups.
type: service
name: prefill-decode
# NOTE(review): `latest` is a floating tag; consider pinning a specific
# version so that redeployments are reproducible.
image: lmsysorg/sglang:latest

env:
  # Listed without a value; presumably taken from the environment of
  # whoever applies this configuration -- confirm.
  - HF_TOKEN
  # Referenced as $MODEL_ID by the prefill and decode commands below.
  - MODEL_ID=zai-org/GLM-4.5-Air-FP8

replicas:
  # Router group: installs sglang_router and launches it in PD-disaggregation
  # mode on port 8000. CPU-only (no GPU requested).
  - count: 1
    # For now, the replica group with the router must have count: 1.
    commands:
      - pip install sglang_router
      - |
        python -m sglang_router.launch_router \
          --host 0.0.0.0 \
          --port 8000 \
          --pd-disaggregation \
          --prefill-policy cache_aware
    router:
      type: sglang
    resources:
      cpu: 4

  # Prefill group: 1 to 4 replicas, scaled on the rps metric (target 3).
  - count: 1..4
    scaling:
      metric: rps
      target: 3
    commands:
      - |
        python -m sglang.launch_server \
          --model-path $MODEL_ID \
          --disaggregation-mode prefill \
          --disaggregation-transfer-backend nixl \
          --host 0.0.0.0 \
          --port 8000 \
          --disaggregation-bootstrap-port 8998
    resources:
      gpu: H200

  # Decode group: 1 to 8 replicas, scaled on the rps metric (target 2).
  - count: 1..8
    scaling:
      metric: rps
      target: 2
    commands:
      - |
        python -m sglang.launch_server \
          --model-path $MODEL_ID \
          --disaggregation-mode decode \
          --disaggregation-transfer-backend nixl \
          --host 0.0.0.0 \
          --port 8000
    resources:
      gpu: H200

# Same port that the router and both server commands pass via --port.
port: 8000
# NOTE(review): duplicates the MODEL_ID value in `env`; keep the two in sync.
model: zai-org/GLM-4.5-Air-FP8

probes:
  # HTTP probe against /health every 15 seconds.
  - type: http
    url: /health
    interval: 15s