3 files changed (+0, -12 lines) under docs/source/user-guide/pd-disaggregation.

File 1 of 3:

@@ -13,14 +13,12 @@ For illustration purposes, let us take GPU as an example and assume the model used is Qwen2.5-7B-Instruct
### Run prefill server
Prefiller Launch Command:
``` bash
- export PYTHONHASHSEED=123456
export CUDA_VISIBLE_DEVICES=0
vllm serve /home/models/Qwen2.5-7B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 1 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
- --enforce-eager \
--no-enable-prefix-caching \
--port 7800 \
--block-size 128 \
@@ -42,14 +40,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
### Run decode server
Decoder Launch Command:
``` bash
- export PYTHONHASHSEED=123456
export CUDA_VISIBLE_DEVICES=0
vllm serve /home/models/Qwen2.5-7B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 1 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
- --enforce-eager \
--no-enable-prefix-caching \
--port 7801 \
--block-size 128 \
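
With PYTHONHASHSEED and --enforce-eager dropped, the prefill and decode commands above are otherwise unchanged. A quick way to confirm both instances are serving is to poll vLLM's health endpoint; this is a minimal sketch assuming default localhost binding and the ports 7800/7801 used above:

```bash
# Hypothetical sanity check, not part of the documented setup:
# poll the /health endpoint of the prefill (7800) and decode (7801) servers.
for port in 7800 7801; do
  # curl returns HTTP 200 once the engine has finished loading the model
  curl -sf "http://localhost:${port}/health" \
    && echo "server on port ${port} is up" \
    || echo "server on port ${port} is not ready yet"
done
```
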
File 2 of 3:

@@ -19,14 +19,12 @@ For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instruct
### Run prefill server
Prefiller Launch Command:
``` bash
- export PYTHONHASHSEED=123456
export ASCEND_RT_VISIBLE_DEVICES=0
vllm serve /home/models/Qwen2.5-7B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 1 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
- --enforce-eager \
--no-enable-prefix-caching \
--port 7800 \
--block-size 128 \
@@ -49,14 +47,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
### Run decode server
Decoder Launch Command:
``` bash
- export PYTHONHASHSEED=123456
export CUDA_VISIBLE_DEVICES=0
vllm serve /home/models/Qwen2.5-7B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 1 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
- --enforce-eager \
--no-enable-prefix-caching \
--port 7801 \
--block-size 128 \
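
One detail worth noting in this file: the prefill command masks devices with ASCEND_RT_VISIBLE_DEVICES, while the decode command uses CUDA_VISIBLE_DEVICES; which variable applies depends on whether the instance runs on an Ascend NPU or an NVIDIA GPU. The following is an illustrative sketch, not vLLM's own dispatch logic, and it assumes npu-smi is the device tool present on Ascend hosts:

```bash
# Illustrative backend check (assumption: npu-smi is present on Ascend hosts).
DEVICE_ID=0
if command -v npu-smi >/dev/null 2>&1; then
  # Ascend NPU backend: mask visible NPUs
  export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_ID}
else
  # NVIDIA GPU backend: mask visible GPUs
  export CUDA_VISIBLE_DEVICES=${DEVICE_ID}
fi
```
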
File 3 of 3:

@@ -13,14 +13,12 @@ For illustration purposes, let us take GPU as an example and assume the model used is Qwen2.5-7B-Instruct
### Run prefill servers
Prefiller1 Launch Command:
``` bash
- export PYTHONHASHSEED=123456
export CUDA_VISIBLE_DEVICES=0
vllm serve /home/models/Qwen2.5-7B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 1 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
- --enforce-eager \
--no-enable-prefix-caching \
--port 7800 \
--block-size 128 \
@@ -41,14 +39,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \

Prefiller2 Launch Command:
``` bash
- export PYTHONHASHSEED=123456
export CUDA_VISIBLE_DEVICES=1
vllm serve /home/models/Qwen2.5-7B-Instruct \
--max-model-len 20000 \
--tensor-parallel-size 1 \
--gpu_memory_utilization 0.87 \
--trust-remote-code \
- --enforce-eager \
--no-enable-prefix-caching \
--port 7801 \
--block-size 128 \
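
The two prefiller commands differ only in the visible device (0 vs 1) and port (7800 vs 7801), so they can be launched from a single loop. This is a minimal sketch assuming only the flags visible above; the flags that follow --block-size 128 in the original document are truncated in this hunk, so append them where the comment indicates:

```bash
# Hypothetical launcher for the two prefillers shown above:
# GPU 0 -> port 7800, GPU 1 -> port 7801.
for i in 0 1; do
  CUDA_VISIBLE_DEVICES=${i} vllm serve /home/models/Qwen2.5-7B-Instruct \
    --max-model-len 20000 \
    --tensor-parallel-size 1 \
    --gpu_memory_utilization 0.87 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --port $((7800 + i)) \
    --block-size 128 &   # append the remaining documented flags here
done
wait
```
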