
Commit d4f82a5

Merge branch 'develop' into dev_1211

2 parents: de4a445 + 010844e

Note: large commits hide some content by default, so only a subset of the changed files is shown below.

44 files changed (+4499 −174 lines)

MANIFEST.in

Lines changed: 3 additions & 6 deletions
@@ -1,7 +1,4 @@
-include LICENSE
-include pyproject.toml
 include CMakeLists.txt
-include requirements.txt
-
-recursive-include examples *
-recursive-include benchmarks *
+graft ucm
+graft examples
+graft benchmarks

README.md

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ in either a local filesystem for single-machine scenarios or through NFS mount p
 
 ## Quick Start
 
-please refer to [Quick Start](https://ucm.readthedocs.io/en/latest/getting-started/quick_start.html).
+please refer to [Quick Start for vLLM](https://ucm.readthedocs.io/en/latest/getting-started/quickstart_vllm.html) and [Quick Start for vLLM-Ascend](https://ucm.readthedocs.io/en/latest/getting-started/quickstart_vllm_ascend.html).
 
 ---
 

docs/source/getting-started/quickstart_vllm.md

Lines changed: 5 additions & 4 deletions
@@ -135,7 +135,6 @@ Then run following commands:
 ```bash
 cd examples/
 # Change the model path to your own model path
-export MODEL_PATH=/home/models/Qwen2.5-14B-Instruct
 python offline_inference.py
 ```
 
@@ -163,12 +162,14 @@ vllm serve Qwen/Qwen2.5-14B-Instruct \
 --kv-transfer-config \
 '{
 "kv_connector": "UCMConnector",
+"kv_connector_module_path": "ucm.integration.vllm.ucm_connector",
 "kv_role": "kv_both",
-"kv_connector_extra_config": {"UCM_CONFIG_FILE": "/vllm-workspace/unified-cache-management/examples/ucm_config_example.yaml"}
+"kv_connector_extra_config": {"UCM_CONFIG_FILE": "/workspace/unified-cache-management/examples/ucm_config_example.yaml"}
 }'
 ```
+**⚠️ The parameter `--no-enable-prefix-caching` is for SSD performance testing, please remove it for production.**
 
-**⚠️ Make sure to replace `"/vllm-workspace/unified-cache-management/examples/ucm_config_example.yaml"` with your actual config file path.**
+**⚠️ Make sure to replace `"/workspace/unified-cache-management/examples/ucm_config_example.yaml"` with your actual config file path.**
 
 
 If you see log as below:
@@ -187,7 +188,7 @@ After successfully started the vLLM server,You can interact with the API as fo
 curl http://localhost:7800/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "/home/models/Qwen2.5-14B-Instruct",
+"model": "Qwen/Qwen2.5-14B-Instruct",
 "prompt": "You are a highly specialized assistant whose mission is to faithfully reproduce English literary texts verbatim, without any deviation, paraphrasing, or omission. Your primary responsibility is accuracy: every word, every punctuation mark, and every line must appear exactly as in the original source. Core Principles: Verbatim Reproduction: If the user asks for a passage, you must output the text word-for-word. Do not alter spelling, punctuation, capitalization, or line breaks. Do not paraphrase, summarize, modernize, or \"improve\" the language. Consistency: The same input must always yield the same output. Do not generate alternative versions or interpretations. Clarity of Scope: Your role is not to explain, interpret, or critique. You are not a storyteller or commentator, but a faithful copyist of English literary and cultural texts. Recognizability: Because texts must be reproduced exactly, they will carry their own cultural recognition. You should not add labels, introductions, or explanations before or after the text. Coverage: You must handle passages from classic literature, poetry, speeches, or cultural texts. Regardless of tone—solemn, visionary, poetic, persuasive—you must preserve the original form, structure, and rhythm by reproducing it precisely. Success Criteria: A human reader should be able to compare your output directly with the original and find zero differences. The measure of success is absolute textual fidelity. Your function can be summarized as follows: verbatim reproduction only, no paraphrase, no commentary, no embellishment, no omission. Please reproduce verbatim the opening sentence of the United States Declaration of Independence (1776), starting with \"When in the Course of human events\" and continuing word-for-word without paraphrasing.",
 "max_tokens": 100,
 "temperature": 0

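The hunk above adds `kv_connector_module_path` to the serve-time `--kv-transfer-config` (the same change recurs in the vLLM-Ascend quickstart and the NFS store guide below). For reference, here is a minimal offline sketch of the same connector settings; it is not part of this commit, and it assumes a recent vLLM whose `KVTransferConfig` exposes `kv_connector_module_path`. The model name and `UCM_CONFIG_FILE` path are placeholders to adjust for your environment.

```python
# Hedged sketch (not from this commit): offline equivalent of the --kv-transfer-config
# shown in the quickstart diff above. Assumes vllm.config.KVTransferConfig accepts
# kv_connector_module_path; the model and config path are placeholders.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

ktc = KVTransferConfig(
    kv_connector="UCMConnector",
    kv_connector_module_path="ucm.integration.vllm.ucm_connector",
    kv_role="kv_both",
    kv_connector_extra_config={
        "UCM_CONFIG_FILE": "/workspace/unified-cache-management/examples/ucm_config_example.yaml"
    },
)

# block_size=128 follows the recommendation in the NFS store guide changed below.
llm = LLM(model="Qwen/Qwen2.5-14B-Instruct", kv_transfer_config=ktc, block_size=128)
outputs = llm.generate(["Hello, UCM!"], SamplingParams(temperature=0, max_tokens=32))
print(outputs[0].outputs[0].text)
```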
docs/source/getting-started/quickstart_vllm_ascend.md

Lines changed: 5 additions & 4 deletions
@@ -103,7 +103,6 @@ Then run following commands:
 ```bash
 cd examples/
 # Change the model path to your own model path
-export MODEL_PATH=/home/models/Qwen2.5-14B-Instruct
 python offline_inference.py
 ```
 
@@ -131,12 +130,14 @@ vllm serve Qwen/Qwen2.5-14B-Instruct \
 --kv-transfer-config \
 '{
 "kv_connector": "UCMConnector",
+"kv_connector_module_path": "ucm.integration.vllm.ucm_connector",
 "kv_role": "kv_both",
-"kv_connector_extra_config": {"UCM_CONFIG_FILE": "/vllm-workspace/unified-cache-management/examples/ucm_config_example.yaml"}
+"kv_connector_extra_config": {"UCM_CONFIG_FILE": "/workspace/unified-cache-management/examples/ucm_config_example.yaml"}
 }'
 ```
+**⚠️ The parameter `--no-enable-prefix-caching` is for SSD performance testing, please remove it for production.**
 
-**⚠️ Make sure to replace `"/vllm-workspace/unified-cache-management/examples/ucm_config_example.yaml"` with your actual config file path.**
+**⚠️ Make sure to replace `"/workspace/unified-cache-management/examples/ucm_config_example.yaml"` with your actual config file path.**
 
 
 If you see log as below:
@@ -155,7 +156,7 @@ After successfully started the vLLM server,You can interact with the API as fo
 curl http://localhost:7800/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "/home/models/Qwen2.5-14B-Instruct",
+"model": "Qwen/Qwen2.5-14B-Instruct",
 "prompt": "You are a highly specialized assistant whose mission is to faithfully reproduce English literary texts verbatim, without any deviation, paraphrasing, or omission. Your primary responsibility is accuracy: every word, every punctuation mark, and every line must appear exactly as in the original source. Core Principles: Verbatim Reproduction: If the user asks for a passage, you must output the text word-for-word. Do not alter spelling, punctuation, capitalization, or line breaks. Do not paraphrase, summarize, modernize, or \"improve\" the language. Consistency: The same input must always yield the same output. Do not generate alternative versions or interpretations. Clarity of Scope: Your role is not to explain, interpret, or critique. You are not a storyteller or commentator, but a faithful copyist of English literary and cultural texts. Recognizability: Because texts must be reproduced exactly, they will carry their own cultural recognition. You should not add labels, introductions, or explanations before or after the text. Coverage: You must handle passages from classic literature, poetry, speeches, or cultural texts. Regardless of tone—solemn, visionary, poetic, persuasive—you must preserve the original form, structure, and rhythm by reproducing it precisely. Success Criteria: A human reader should be able to compare your output directly with the original and find zero differences. The measure of success is absolute textual fidelity. Your function can be summarized as follows: verbatim reproduction only, no paraphrase, no commentary, no embellishment, no omission. Please reproduce verbatim the opening sentence of the United States Declaration of Independence (1776), starting with \"When in the Course of human events\" and continuing word-for-word without paraphrasing.",
 "max_tokens": 100,
 "temperature": 0

docs/source/user-guide/prefix-cache/nfs_store.md

Lines changed: 1 addition & 2 deletions
@@ -109,8 +109,6 @@ Explanation:
 
 ## Launching Inference
 
-### Offline Inference
-
 In this guide, we describe **online inference** using vLLM with the UCM connector, deployed as an OpenAI-compatible server. For best performance with UCM, it is recommended to set `block_size` to 128.
 
 To start the vLLM server with the Qwen/Qwen2.5-14B-Instruct model, run:
@@ -129,6 +127,7 @@ vllm serve Qwen/Qwen2.5-14B-Instruct \
 '{
 "kv_connector": "UCMConnector",
 "kv_role": "kv_both",
+"kv_connector_module_path": "ucm.integration.vllm.ucm_connector",
 "kv_connector_extra_config": {"UCM_CONFIG_FILE": "/vllm-workspace/unified-cache-management/examples/ucm_config_example.yaml"}
 }'
 ```

pyproject.toml

Lines changed: 7 additions & 4 deletions
@@ -1,12 +1,15 @@
 [build-system]
-requires = ["setuptools>=45", "wheel", "cmake", "torch", "pybind11"]
+requires = [
+    "setuptools>=64",
+    "cmake>=3.18",
+    "wheel",
+]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "uc-manager"
-authors = [{name = "UCM Team"}]
-license = "MIT"
-license-files = ["LICENSE"]
+authors = [{name = "Unified Cache Team"}]
+license = { file="LICENSE" }
 readme = "README.md"
 description = "Persist and reuse KV Cache to speedup your LLM."
 requires-python = ">=3.10"

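With `torch` and `pybind11` dropped from `[build-system].requires` (and no longer imported by `setup.py`, next diff), the isolated build environment only needs setuptools, cmake, and wheel, while the target platform is still selected through environment variables. Below is a hedged sketch of driving a platform-specific wheel build under the new layout; the choice of `pip wheel` and the output directory are assumptions, not part of this commit.

```python
# Hypothetical build driver (not part of this commit). PLATFORM and ENABLE_SPARSE
# are read by setup.py via os.getenv(); environment variables are inherited by
# pip's isolated build subprocess.
import os
import subprocess
import sys

env = dict(os.environ)
env["PLATFORM"] = "cuda"        # or "ascend" / "musa" / "maca"; unset falls back to "simu"
env["ENABLE_SPARSE"] = "false"  # "true" adds -DBUILD_UCM_SPARSE=ON

subprocess.check_call(
    [sys.executable, "-m", "pip", "wheel", ".", "--no-deps", "-w", "dist/"],
    env=env,
)
```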
setup.py

Lines changed: 34 additions & 101 deletions
@@ -25,150 +25,83 @@
 import os
 import subprocess
 import sys
-import sysconfig
-from glob import glob
 
-import pybind11
-import torch
-import torch.utils.cpp_extension
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 
 ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
 PLATFORM = os.getenv("PLATFORM")
-
 ENABLE_SPARSE = os.getenv("ENABLE_SPARSE")
 
 
 def _enable_sparse() -> bool:
     return ENABLE_SPARSE is not None and ENABLE_SPARSE.lower() == "true"
 
 
-def _is_cuda() -> bool:
-    return PLATFORM == "cuda"
-
-
-def _is_npu() -> bool:
-    return PLATFORM == "ascend"
-
-
-def _is_musa() -> bool:
-    return PLATFORM == "musa"
-
-
-def _is_maca() -> bool:
-    return PLATFORM == "maca"
-
-
 class CMakeExtension(Extension):
-    def __init__(self, name: str, sourcedir: str = ""):
+    def __init__(self, name: str, source_dir: str = ""):
         super().__init__(name, sources=[])
-        self.sourcedir = os.path.abspath(sourcedir)
+        self.cmake_file_path = os.path.abspath(source_dir)
 
 
 class CMakeBuild(build_ext):
     def run(self):
+        build_dir = os.path.abspath(self.build_temp)
+        os.makedirs(build_dir, exist_ok=True)
+
         for ext in self.extensions:
            self.build_cmake(ext)
 
     def build_cmake(self, ext: CMakeExtension):
-        build_dir = self.build_temp
-        os.makedirs(build_dir, exist_ok=True)
+        build_dir = os.path.abspath(self.build_temp)
+        install_dir = os.path.abspath(self.build_lib)
 
         cmake_args = [
-            "cmake",
             "-DCMAKE_BUILD_TYPE=Release",
             f"-DPYTHON_EXECUTABLE={sys.executable}",
+            f"-DCMAKE_INSTALL_PREFIX={install_dir}",
         ]
 
-        torch_cmake_prefix = torch.utils.cmake_prefix_path
-        pybind11_cmake_dir = pybind11.get_cmake_dir()
-
-        cmake_prefix_paths = [torch_cmake_prefix, pybind11_cmake_dir]
-        cmake_args.append(f"-DCMAKE_PREFIX_PATH={';'.join(cmake_prefix_paths)}")
-
-        torch_includes = torch.utils.cpp_extension.include_paths()
-        python_include = sysconfig.get_path("include")
-        pybind11_include = pybind11.get_include()
-
-        all_includes = torch_includes + [python_include, pybind11_include]
-        cmake_include_string = ";".join(all_includes)
-        cmake_args.append(f"-DEXTERNAL_INCLUDE_DIRS={cmake_include_string}")
-
-        if _is_cuda():
-            cmake_args.append("-DRUNTIME_ENVIRONMENT=cuda")
-        elif _is_npu():
-            cmake_args.append("-DRUNTIME_ENVIRONMENT=ascend")
-        elif _is_musa():
-            cmake_args.append("-DRUNTIME_ENVIRONMENT=musa")
-        elif _is_maca():
-            cmake_args.append("-DRUNTIME_ENVIRONMENT=maca")
-            cmake_args.append("-DBUILD_UCM_SPARSE=OFF")
-        else:
-            raise RuntimeError(
-                "No supported accelerator found. "
-                "Please ensure either CUDA/MUSA or NPU is available."
-            )
-
         if _enable_sparse():
-            cmake_args.append("-DBUILD_UCM_SPARSE=ON")
-
-        cmake_args.append(ext.sourcedir)
+            cmake_args += ["-DBUILD_UCM_SPARSE=ON"]
+
+        match PLATFORM:
+            case "cuda":
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=cuda"]
+            case "ascend":
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=ascend"]
+            case "musa":
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=musa"]
+            case "maca":
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=maca"]
+                cmake_args += ["-DBUILD_UCM_SPARSE=OFF"]
+            case _:
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=simu"]
+                cmake_args += ["-DBUILD_UCM_SPARSE=OFF"]
 
-        print(f"[INFO] Building {ext.name} module with CMake")
-        print(f"[INFO] Source directory: {ext.sourcedir}")
-        print(f"[INFO] Build directory: {build_dir}")
-        print(f"[INFO] CMake command: {' '.join(cmake_args)}")
-
-        subprocess.check_call(cmake_args, cwd=build_dir)
+        subprocess.check_call(
+            ["cmake", *cmake_args, ext.cmake_file_path], cwd=build_dir
+        )
         subprocess.check_call(
             ["cmake", "--build", ".", "--config", "Release", "--", "-j8"],
             cwd=build_dir,
         )
 
+        subprocess.check_call(
+            ["cmake", "--install", ".", "--config", "Release", "--component", "ucm"],
+            cwd=build_dir,
+        )
 
-def _get_packages():
-    """Discover Python packages, optionally filtering out sparse-related ones."""
-    packages = find_packages()
-    if not _enable_sparse():
-        packages = [pkg for pkg in packages if not pkg.startswith("ucm.sparse")]
-    return packages
-
-
-def _get_package_data_with_so(packages=None):
-    """Automatically discover all packages and include .so files."""
-    if packages is None:
-        packages = _get_packages()
-    package_data = {}
-
-    for package in packages:
-        # Convert package name to directory path
-        package_dir = os.path.join(ROOT_DIR, package.replace(".", os.sep))
-
-        # Check if this package directory contains .so files
-        so_files = glob(os.path.join(package_dir, "*.so"))
-        if so_files:
-            package_data[package] = ["*.so"]
-            print(f"[INFO] Including .so files for package: {package}")
-
-    print(f"[INFO] Package data: {package_data}")
-    return package_data
-
-
-ext_modules = []
-ext_modules.append(CMakeExtension(name="ucm", sourcedir=ROOT_DIR))
-
-packages = _get_packages()
 
 setup(
     name="uc-manager",
-    version="0.1.1",
+    version="0.1.2",
     description="Unified Cache Management",
     author="Unified Cache Team",
-    packages=packages,
+    packages=find_packages(),
     python_requires=">=3.10",
-    ext_modules=ext_modules,
+    ext_modules=[CMakeExtension(name="ucm", source_dir=ROOT_DIR)],
     cmdclass={"build_ext": CMakeBuild},
-    package_data=_get_package_data_with_so(packages),
     zip_safe=False,
+    include_package_data=False,
 )

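The rewritten `build_cmake` replaces the accelerator `if/elif` chain, which raised a `RuntimeError` on unknown platforms, with a `match` statement that falls back to a `simu` runtime with sparse support disabled; for `maca`, `-DBUILD_UCM_SPARSE=OFF` is appended even when `ENABLE_SPARSE=true`, and the later CMake definition takes precedence. A standalone sketch of that mapping, mirroring the diff (the helper name `runtime_flags` is hypothetical):

```python
# Standalone sketch mirroring the new flag selection in setup.py's CMakeBuild.
# The helper name is hypothetical; the flag values and ordering come from the diff.
import os


def runtime_flags(platform: str | None, enable_sparse: bool) -> list[str]:
    flags = ["-DBUILD_UCM_SPARSE=ON"] if enable_sparse else []
    match platform:
        case "cuda":
            flags += ["-DRUNTIME_ENVIRONMENT=cuda"]
        case "ascend":
            flags += ["-DRUNTIME_ENVIRONMENT=ascend"]
        case "musa":
            flags += ["-DRUNTIME_ENVIRONMENT=musa"]
        case "maca":
            # sparse is always disabled on maca; the later -D flag overrides the earlier one
            flags += ["-DRUNTIME_ENVIRONMENT=maca", "-DBUILD_UCM_SPARSE=OFF"]
        case _:
            # new default: simulation runtime instead of raising RuntimeError
            flags += ["-DRUNTIME_ENVIRONMENT=simu", "-DBUILD_UCM_SPARSE=OFF"]
    return flags


if __name__ == "__main__":
    print(runtime_flags(os.getenv("PLATFORM"), os.getenv("ENABLE_SPARSE", "").lower() == "true"))
```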