36 changes: 36 additions & 0 deletions examples/splitwise/README.md
@@ -0,0 +1,36 @@
# Run the Examples on NVIDIA CUDA GPUs

## Prepare the Environment
Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) to pull the Docker image, for example:
```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0
```
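
A container can then be started from the image. The following is a minimal sketch; the GPU, networking, and shared-memory flags are assumptions that may need adjusting for your machine, and RDMA access typically requires extra device mounts or `--privileged`:
```
docker run --gpus all --network host --shm-size 64g -it \
    ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0 /bin/bash
```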

[NVIDIA MLNX_OFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/) and [Redis](https://redis.io/) come pre-installed in the Docker container.
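
To confirm these components are actually available inside the container, a quick check like the following can help (a sketch; the exact commands depend on the image version):
```
ofed_info -s            # installed MLNX_OFED version
ibv_devices             # lists RDMA-capable devices, if any
redis-server --version  # pre-installed Redis
```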

## Build and Install FastDeploy

```
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy

export ENABLE_FD_RDMA=1

# Argument 1: Whether to build wheel package (1 for yes, 0 for compile only)
# Argument 2: Python interpreter path
# Argument 3: Whether to compile CPU inference operators
# Argument 4: Target GPU architectures
bash build.sh 1 python false [80,90]
```
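
Once the build finishes, a quick import check confirms the wheel was installed into the current Python environment (a sketch; the version attribute may vary by build):
```
python -c "import fastdeploy; print(getattr(fastdeploy, '__version__', 'import ok'))"
```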

## Run the Examples

Run the shell scripts in this directory, e.g. `bash start_v0_tp1.sh` or `bash start_v1_tp1.sh`.

Note that there are two methods for splitwise deployment (a condensed sketch of the v1 flow follows the list):
* v0: uses splitwise_scheduler or dp_scheduler; requests are scheduled in the engine.
* v1: uses the router; requests are scheduled in the router.
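
For reference, the v1 method boils down to starting the router first and then registering each server with it via `--router`. Below is a condensed sketch based on the commands in these scripts; the `--model` flag and port values are assumptions (the corresponding hunks are collapsed above), and the full `start_v1_tp1.sh` passes further arguments such as `--max-model-len`:
```
# Sketch of the v1 flow (ports and --model are illustrative assumptions;
# see start_v1_tp1.sh for the complete invocation)
python -m fastdeploy.router.launch --port 52600 --splitwise &

python -m fastdeploy.entrypoints.openai.api_server \
    --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
    --port 52400 \
    --router "0.0.0.0:52600" &
```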

# Run the Examples on Kunlunxin XPUs

Coming soon...
34 changes: 14 additions & 20 deletions examples/splitwise/start_mixed.sh
@@ -3,41 +3,35 @@ set -e

# Test mixed server + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

S1_PORT=52400
S2_PORT=52500
ROUTER_PORT=52600

ports=(
$S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3))
$S2_PORT $((S2_PORT + 1)) $((S2_PORT + 2)) $((S2_PORT + 3))
$ROUTER_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.router.launch \
--port ${ROUTER_PORT} \
>${FD_LOG_DIR}/nohup 2>&1 &
sleep 1

# start modelserver 0
export CUDA_VISIBLE_DEVICES=0
@@ -53,7 +47,6 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \
--router "0.0.0.0:${ROUTER_PORT}" \
>${FD_LOG_DIR}/nohup 2>&1 &
sleep 1

wait_for_health ${S1_PORT}

@@ -76,12 +69,13 @@ wait_for_health ${S2_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"stream": false
}'
32 changes: 15 additions & 17 deletions examples/splitwise/start_v0_tp1.sh
@@ -6,22 +6,8 @@ set -e
# v0: using splitwise_scheduler or dp_scheduler
# v1: using local_scheduler + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

P_PORT=52400
D_PORT=52500
REDIS_PORT=56388
REDIS_PORT="${REDIS_PORT:-56388}"

ports=(
$P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
$D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
$REDIS_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start redis
if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then
@@ -104,12 +101,13 @@ wait_for_health ${D_PORT}

# send request
sleep 10 # make sure servers are registered to the scheduler
echo "send request..."
curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"stream": false
}'
111 changes: 0 additions & 111 deletions examples/splitwise/start_v0_tp2.sh

This file was deleted.

35 changes: 16 additions & 19 deletions examples/splitwise/start_v1_tp1.sh
@@ -6,22 +6,8 @@ set -e
# v0: using splitwise_scheduler or dp_scheduler
# v1: using local_scheduler + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

P_PORT=52400
D_PORT=52500
ROUTER_PORT=52600
ROUTER_PORT=52700

ports=(
$P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
$D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
$ROUTER_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start router
export FD_LOG_DIR="log_router"
@@ -50,7 +47,6 @@ nohup python -m fastdeploy.router.launch \
--port ${ROUTER_PORT} \
--splitwise \
>${FD_LOG_DIR}/nohup 2>&1 &
sleep 1

# start prefill
export CUDA_VISIBLE_DEVICES=0
@@ -97,12 +93,13 @@ wait_for_health ${D_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"max_tokens": 100,
"stream": false
}'