From 5d6749a39e0500258153decf4938415d952053d3 Mon Sep 17 00:00:00 2001
From: Bradley
Date: Thu, 27 Nov 2025 15:58:38 +0800
Subject: [PATCH] Fix multiprocessing start method for CUDA compatibility in LMCache kv_cache_sharing_lmcache_v1.py

The CUDA runtime cannot be re-initialized in worker processes created with
the default "fork" start method, so the store and retrieve workers now use
a "spawn" multiprocessing context. Also remove the vLLM v0 disaggregated
prefill example (disagg_prefill_lmcache_v0.py) and its README section.

Signed-off-by: Bradley
---
 examples/others/lmcache/README.md             |   4 -
 .../lmcache/disagg_prefill_lmcache_v0.py      | 144 ------------------
 .../lmcache/kv_cache_sharing_lmcache_v1.py    |  15 +-
 3 files changed, 11 insertions(+), 152 deletions(-)
 delete mode 100644 examples/others/lmcache/disagg_prefill_lmcache_v0.py

diff --git a/examples/others/lmcache/README.md b/examples/others/lmcache/README.md
index 759be55d6f1c..38d99f806d7d 100644
--- a/examples/others/lmcache/README.md
+++ b/examples/others/lmcache/README.md
@@ -54,7 +54,3 @@ The main script generates several log files:
 ## 3. KV Cache Sharing
 
 The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances.
-
-## 4. Disaggregated Prefill in vLLM v0
-
-The `disaggregated_prefill_lmcache_v0.py` provides an example of how to run disaggregated prefill in vLLM v0.
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
deleted file mode 100644
index 6669eb3fb3d3..000000000000
--- a/examples/others/lmcache/disagg_prefill_lmcache_v0.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This file demonstrates the example usage of disaggregated prefilling
-with LMCache.
-We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
-and launch an additional LMCache server.
-KV cache is transferred in the following manner:
-vLLM prefill node -> LMCache server -> vLLM decode node.
-
-Note that `pip install lmcache` is needed to run this example.
-Learn more about LMCache in https://github.com/LMCache/LMCache.
-"""
-
-import os
-import subprocess
-import time
-from multiprocessing import Event, Process
-
-from lmcache.experimental.cache_engine import LMCacheEngineBuilder
-from lmcache.integration.vllm.utils import ENGINE_NAME
-
-from vllm import LLM, SamplingParams
-from vllm.config import KVTransferConfig
-
-# LMCache-related environment variables
-# The port to start LMCache server
-port = 8100
-# Use experimental features in LMCache
-os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
-# LMCache is set to use 256 tokens per chunk
-os.environ["LMCACHE_CHUNK_SIZE"] = "256"
-# Disable local CPU backend in LMCache
-os.environ["LMCACHE_LOCAL_CPU"] = "False"
-# Set local CPU memory buffer limit to 5.0 GB
-os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-# Set the remote URL for LMCache server
-os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
-# Set the serializer/deserializer between vllm and LMCache server
-# `naive` indicates using raw bytes of the tensor without any compression
-os.environ["LMCACHE_REMOTE_SERDE"] = "naive"
-
-prompts = [
-    "Hello, how are you?" * 1000,
-]
-
-
-def run_prefill(prefill_done, prompts):
-    # We use GPU 0 for prefill node.
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
-    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
-
-    ktc = KVTransferConfig(
-        kv_connector="LMCacheConnector",
-        kv_role="kv_producer",
-        kv_rank=0,
-        kv_parallel_size=2,
-    )
-    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
-    # memory. Reduce the value if your GPU has less memory.
-    llm = LLM(
-        model="mistralai/Mistral-7B-Instruct-v0.2",
-        kv_transfer_config=ktc,
-        max_model_len=8000,
-        gpu_memory_utilization=0.8,
-        enforce_eager=True,
-    )
-
-    # llm.generate(prompts, sampling_params)
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        print(f"Generated text: {generated_text!r}")
-    print("Prefill node is finished.")
-    prefill_done.set()
-
-    # Clean up lmcache backend
-    LMCacheEngineBuilder.destroy(ENGINE_NAME)
-
-
-def run_decode(prefill_done, prompts, timeout=1):
-    # We use GPU 1 for decode node.
-    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
-
-    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-
-    ktc = KVTransferConfig(
-        kv_connector="LMCacheConnector",
-        kv_role="kv_consumer",
-        kv_rank=1,
-        kv_parallel_size=2,
-    )
-    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
-    # of memory. Reduce the value if your GPU has less memory.
-    llm = LLM(
-        model="mistralai/Mistral-7B-Instruct-v0.2",
-        kv_transfer_config=ktc,
-        max_model_len=8000,
-        gpu_memory_utilization=0.8,
-        enforce_eager=True,
-    )
-
-    print("Waiting for prefill node to finish...")
-    prefill_done.wait()
-    time.sleep(timeout)
-
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        print(f"Generated text: {generated_text!r}")
-
-    # Clean up lmcache backend
-    LMCacheEngineBuilder.destroy(ENGINE_NAME)
-
-
-def run_lmcache_server(port):
-    server_proc = subprocess.Popen(
-        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
-    )
-    return server_proc
-
-
-def main():
-    prefill_done = Event()
-    prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))
-    decode_process = Process(target=run_decode, args=(prefill_done, prompts))
-    lmcache_server_process = run_lmcache_server(port)
-
-    # Start prefill node
-    prefill_process.start()
-
-    # Start decode node
-    decode_process.start()
-
-    # Clean up the processes
-    decode_process.join()
-    prefill_process.terminate()
-    lmcache_server_process.terminate()
-    lmcache_server_process.wait()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
index 46e2d903d4be..961246f5de28 100644
--- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -12,10 +12,10 @@ Learn more about LMCache in https://github.com/LMCache/LMCache.
""" +import multiprocessing import os import subprocess import time -from multiprocessing import Event, Process from lmcache.integration.vllm.utils import ENGINE_NAME from lmcache.v1.cache_engine import LMCacheEngineBuilder @@ -111,9 +111,16 @@ def run_lmcache_server(port): def main(): - store_done = Event() - store_process = Process(target=run_store, args=(store_done, prompts)) - retrieve_process = Process(target=run_retrieve, args=(store_done, prompts)) + # Set multiprocessing start method to 'spawn' for CUDA compatibility + # This is required when using CUDA with multiprocessing + if multiprocessing.get_start_method(allow_none=True) != "spawn": + multiprocessing.set_start_method("spawn", force=True) + + # Use spawn context to create processes + ctx = multiprocessing.get_context("spawn") + store_done = ctx.Event() + store_process = ctx.Process(target=run_store, args=(store_done, prompts)) + retrieve_process = ctx.Process(target=run_retrieve, args=(store_done, prompts)) lmcache_server_process = run_lmcache_server(port) # Start KV cache store process