Skip to content

Commit 4f3ca7e

Browse files
committed
release v0.1.2
2 parents debb233 + 24e6bfa commit 4f3ca7e

File tree

21 files changed

+579
-66
lines changed

21 files changed

+579
-66
lines changed

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ include LICENSE
22
include pyproject.toml
33
include CMakeLists.txt
44
include requirements.txt
5+
include setup.py
56

67
recursive-include examples *
78
recursive-include benchmarks *

docs/source/user-guide/pd-disaggregation/1p1d.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,12 @@ For illustration purposes, let us take GPU as an example and assume the model us
1313
### Run prefill server
1414
Prefiller Launch Command:
1515
```bash
16-
export PYTHONHASHSEED=123456
1716
export CUDA_VISIBLE_DEVICES=0
1817
vllm serve /home/models/Qwen2.5-7B-Instruct \
1918
--max-model-len 20000 \
2019
--tensor-parallel-size 1 \
2120
--gpu_memory_utilization 0.87 \
2221
--trust-remote-code \
23-
--enforce-eager \
2422
--no-enable-prefix-caching \
2523
--port 7800 \
2624
--block-size 128 \
@@ -42,14 +40,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
4240
### Run decode server
4341
Decoder Launch Command:
4442
```bash
45-
export PYTHONHASHSEED=123456
4643
export CUDA_VISIBLE_DEVICES=0
4744
vllm serve /home/models/Qwen2.5-7B-Instruct \
4845
--max-model-len 20000 \
4946
--tensor-parallel-size 1 \
5047
--gpu_memory_utilization 0.87 \
5148
--trust-remote-code \
52-
--enforce-eager \
5349
--no-enable-prefix-caching \
5450
--port 7801 \
5551
--block-size 128 \

docs/source/user-guide/pd-disaggregation/npgd.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,12 @@ For illustration purposes, let us assume that the model used is Qwen2.5-7B-Instr
1919
### Run prefill server
2020
Prefiller Launch Command:
2121
```bash
22-
export PYTHONHASHSEED=123456
2322
export ASCEND_RT_VISIBLE_DEVICES=0
2423
vllm serve /home/models/Qwen2.5-7B-Instruct \
2524
--max-model-len 20000 \
2625
--tensor-parallel-size 1 \
2726
--gpu_memory_utilization 0.87 \
2827
--trust-remote-code \
29-
--enforce-eager \
3028
--no-enable-prefix-caching \
3129
--port 7800 \
3230
--block-size 128 \
@@ -49,14 +47,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
4947
### Run decode server
5048
Decoder Launch Command:
5149
```bash
52-
export PYTHONHASHSEED=123456
5350
export CUDA_VISIBLE_DEVICES=0
5451
vllm serve /home/models/Qwen2.5-7B-Instruct \
5552
--max-model-len 20000 \
5653
--tensor-parallel-size 1 \
5754
--gpu_memory_utilization 0.87 \
5855
--trust-remote-code \
59-
--enforce-eager \
6056
--no-enable-prefix-caching \
6157
--port 7801 \
6258
--block-size 128 \

docs/source/user-guide/pd-disaggregation/xpyd.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,12 @@ For illustration purposes, let us take GPU as an example and assume the model us
1313
### Run prefill servers
1414
Prefiller1 Launch Command:
1515
```bash
16-
export PYTHONHASHSEED=123456
1716
export CUDA_VISIBLE_DEVICES=0
1817
vllm serve /home/models/Qwen2.5-7B-Instruct \
1918
--max-model-len 20000 \
2019
--tensor-parallel-size 1 \
2120
--gpu_memory_utilization 0.87 \
2221
--trust-remote-code \
23-
--enforce-eager \
2422
--no-enable-prefix-caching \
2523
--port 7800 \
2624
--block-size 128 \
@@ -41,14 +39,12 @@ vllm serve /home/models/Qwen2.5-7B-Instruct \
4139

4240
Prefiller2 Launch Command:
4341
```bash
44-
export PYTHONHASHSEED=123456
4542
export CUDA_VISIBLE_DEVICES=1
4643
vllm serve /home/models/Qwen2.5-7B-Instruct \
4744
--max-model-len 20000 \
4845
--tensor-parallel-size 1 \
4946
--gpu_memory_utilization 0.87 \
5047
--trust-remote-code \
51-
--enforce-eager \
5248
--no-enable-prefix-caching \
5349
--port 7801 \
5450
--block-size 128 \

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@ build-backend = "setuptools.build_meta"
55
[project]
66
name = "uc-manager"
77
authors = [{name = "UCM Team"}]
8-
license = "MIT"
9-
license-files = ["LICENSE"]
8+
license = { file="LICENSE" }
109
readme = "README.md"
1110
description = "Persist and reuse KV Cache to speedup your LLM."
1211
requires-python = ">=3.10"

setup.py

Lines changed: 61 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@
2323
#
2424

2525
import os
26+
import shutil
2627
import subprocess
2728
import sys
2829
import sysconfig
30+
import warnings
2931
from glob import glob
3032

3133
import pybind11
@@ -34,6 +36,12 @@
3436
from setuptools import Extension, find_packages, setup
3537
from setuptools.command.build_ext import build_ext
3638

39+
# Suppress warnings about packages absent from packages configuration
40+
# These are expected for C++ source directories, test directories, etc.
41+
warnings.filterwarnings(
42+
"ignore", message=".*Package.*is absent from the `packages` configuration.*"
43+
)
44+
3745
ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
3846
PLATFORM = os.getenv("PLATFORM")
3947

@@ -45,15 +53,11 @@ def _enable_sparse() -> bool:
4553

4654

4755
def _is_cuda() -> bool:
48-
return PLATFORM == "cuda"
49-
56+
return PLATFORM == "cuda" or (hasattr(torch, "cuda") and torch.cuda.is_available())
5057

51-
def _is_npu() -> bool:
52-
return PLATFORM == "ascend"
5358

54-
55-
def _is_musa() -> bool:
56-
return PLATFORM == "musa"
59+
def _is_maca() -> bool:
60+
return PLATFORM == "maca"
5761

5862

5963
class CMakeExtension(Extension):
@@ -67,6 +71,8 @@ def run(self):
6771
for ext in self.extensions:
6872
self.build_cmake(ext)
6973

74+
self._copy_so_files_to_build_lib()
75+
7076
def build_cmake(self, ext: CMakeExtension):
7177
build_dir = self.build_temp
7278
os.makedirs(build_dir, exist_ok=True)
@@ -93,15 +99,8 @@ def build_cmake(self, ext: CMakeExtension):
9399

94100
if _is_cuda():
95101
cmake_args.append("-DRUNTIME_ENVIRONMENT=cuda")
96-
elif _is_npu():
97-
cmake_args.append("-DRUNTIME_ENVIRONMENT=ascend")
98-
elif _is_musa():
99-
cmake_args.append("-DRUNTIME_ENVIRONMENT=musa")
100102
else:
101-
raise RuntimeError(
102-
"No supported accelerator found. "
103-
"Please ensure either CUDA/MUSA or NPU is available."
104-
)
103+
cmake_args.append("-DRUNTIME_ENVIRONMENT=ascend")
105104

106105
if _enable_sparse():
107106
cmake_args.append("-DBUILD_UCM_SPARSE=ON")
@@ -119,33 +118,58 @@ def build_cmake(self, ext: CMakeExtension):
119118
cwd=build_dir,
120119
)
121120

121+
def _copy_so_files_to_build_lib(self):
122+
"""Copy .so files from source directories to build_lib for installation."""
123+
if not hasattr(self, "build_lib") or not self.build_lib:
124+
return
122125

123-
def _get_packages():
124-
"""Discover Python packages, optionally filtering out sparse-related ones."""
125-
packages = find_packages()
126-
if not _enable_sparse():
127-
packages = [pkg for pkg in packages if not pkg.startswith("ucm.sparse")]
128-
return packages
126+
packages = _get_packages()
127+
copied_count = 0
129128

129+
for package in packages:
130+
# Source directory where CMake outputs .so files
131+
source_package_dir = os.path.join(ROOT_DIR, package.replace(".", os.sep))
130132

131-
def _get_package_data_with_so(packages=None):
132-
"""Automatically discover all packages and include .so files."""
133-
if packages is None:
134-
packages = _get_packages()
135-
package_data = {}
133+
# Destination in build_lib
134+
build_package_dir = os.path.join(
135+
self.build_lib, package.replace(".", os.sep)
136+
)
137+
138+
# Find all .so files in the source package directory
139+
so_files = glob(os.path.join(source_package_dir, "*.so"))
140+
141+
if so_files:
142+
# Ensure destination directory exists
143+
os.makedirs(build_package_dir, exist_ok=True)
144+
145+
# Copy each .so file
146+
for so_file in so_files:
147+
dest_file = os.path.join(
148+
build_package_dir, os.path.basename(so_file)
149+
)
150+
shutil.copy2(so_file, dest_file)
151+
copied_count += 1
152+
print(
153+
f"[INFO] Copied {os.path.basename(so_file)} to {build_package_dir}"
154+
)
155+
156+
if copied_count > 0:
157+
print(f"[INFO] Successfully copied {copied_count} .so file(s) to build_lib")
158+
else:
159+
print(
160+
"[WARNING] No .so files found to copy. Extensions may not have been built."
161+
)
136162

137-
for package in packages:
138-
# Convert package name to directory path
139-
package_dir = os.path.join(ROOT_DIR, package.replace(".", os.sep))
140163

141-
# Check if this package directory contains .so files
142-
so_files = glob(os.path.join(package_dir, "*.so"))
143-
if so_files:
144-
package_data[package] = ["*.so"]
145-
print(f"[INFO] Including .so files for package: {package}")
164+
def _get_packages():
165+
"""Discover Python packages, optionally filtering out sparse-related ones."""
166+
sparse_enabled = _enable_sparse()
167+
exclude_patterns = []
168+
if not sparse_enabled:
169+
exclude_patterns.append("ucm.sparse*")
146170

147-
print(f"[INFO] Package data: {package_data}")
148-
return package_data
171+
packages = find_packages(exclude=exclude_patterns)
172+
return packages
149173

150174

151175
ext_modules = []
@@ -155,13 +179,12 @@ def _get_package_data_with_so(packages=None):
155179

156180
setup(
157181
name="uc-manager",
158-
version="0.1.1",
182+
version="0.1.2",
159183
description="Unified Cache Management",
160184
author="Unified Cache Team",
161185
packages=packages,
162186
python_requires=">=3.10",
163187
ext_modules=ext_modules,
164188
cmdclass={"build_ext": CMakeBuild},
165-
package_data=_get_package_data_with_so(packages),
166189
zip_safe=False,
167190
)

ucm/integration/vllm/ucm_connector.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from vllm.distributed.parallel_state import get_tp_group, get_world_group
1717
from vllm.platforms import current_platform
1818
from vllm.v1.core.sched.output import SchedulerOutput
19-
from vllm.v1.request import Request
2019

2120
from ucm.logger import init_logger
2221
from ucm.shared.metrics import ucmmonitor
@@ -29,6 +28,7 @@
2928
from vllm.attention.backends.abstract import AttentionMetadata
3029
from vllm.forward_context import ForwardContext
3130
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
31+
from vllm.v1.request import Request
3232

3333
logger = init_logger(__name__)
3434

@@ -178,11 +178,15 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
178178
self.metrics_config,
179179
)
180180
self.monitor = ucmmonitor.StatsMonitor.get_instance()
181-
self.synchronize = (
182-
torch.cuda.synchronize
183-
if current_platform.is_cuda_alike()
184-
else torch.npu.synchronize
185-
)
181+
182+
self.synchronize = (
183+
torch.cuda.synchronize
184+
if current_platform.is_cuda_alike()
185+
else torch.npu.synchronize
186+
)
187+
188+
# invalid block IDs due to load errors
189+
self._invalid_block_ids: set[int] = set()
186190

187191
def generate_hash(self, block_size: int, request: "Request") -> list[str]:
188192
token_ids = request.all_token_ids
@@ -513,6 +517,9 @@ def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
513517
# TODO error handling
514518
if self.global_rank == 0 or not self.load_only_first_rank:
515519
if self.store.wait(task) != 0:
520+
self._invalid_block_ids.update(
521+
metadata.request_meta[request_id].load_block_ids[1]
522+
)
516523
logger.error(f"request {request_id} load kv cache failed.")
517524
if self.load_only_first_rank:
518525
self._broadcast(req_broadcast_addr[request_id])
@@ -552,7 +559,9 @@ def wait_for_save(self) -> None:
552559
# TODO support PP
553560
if (self.is_mla or self.is_dsa) and self.global_rank != 0:
554561
return
555-
if self.metrics_config:
562+
if self.metrics_config or current_platform.device_type == "npu":
563+
# When using vllm_ascend, we must synchronize here; otherwise an accuracy problem will arise
564+
# This has already been fixed in the latest main branch of vllm_ascend, so synchronize will no longer be needed in future versions.
556565
self.synchronize()
557566

558567
metadata = self._get_connector_metadata()
@@ -626,6 +635,18 @@ def wait_for_save(self) -> None:
626635
def clear_connector_metadata(self) -> None:
627636
super().clear_connector_metadata()
628637

638+
def get_block_ids_with_load_errors(self) -> set[int]:
639+
"""
640+
Get the set of block IDs that failed to load.
641+
642+
Returns:
643+
Set of block IDs that encountered load errors.
644+
Empty set if no load errors occurred.
645+
"""
646+
res = self._invalid_block_ids
647+
self._invalid_block_ids = set()
648+
return res
649+
629650

630651
class UCMLayerWiseConnector(UCMDirectConnector):
631652
"""
@@ -866,3 +887,13 @@ def clear_connector_metadata(self) -> None:
866887
after the model execution.
867888
"""
868889
self.connector.clear_connector_metadata()
890+
891+
def get_block_ids_with_load_errors(self) -> set[int]:
892+
"""
893+
Get the set of block IDs that failed to load.
894+
895+
Returns:
896+
Set of block IDs that encountered load errors.
897+
Empty set if no load errors occurred.
898+
"""
899+
return self.connector.get_block_ids_with_load_errors()

ucm/shared/trans/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
if(RUNTIME_ENVIRONMENT STREQUAL "ascend")
22
add_subdirectory(ascend)
33
endif()
4+
if(RUNTIME_ENVIRONMENT STREQUAL "maca")
5+
add_subdirectory(maca)
6+
endif()
47
if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
58
add_subdirectory(cuda)
69
endif()

0 commit comments

Comments
 (0)