From 8a9610e62f534c1f17afa121122f9fd93aaa01bd Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 16 Jun 2025 20:28:05 +0000 Subject: [PATCH 01/33] Initial working implementation of a-LoRA. Co-authored-by: Greenewald Co-authored-by: Allison Li Signed-off-by: Thomas Parnell --- examples/alora/alora_server_testing.py | 67 ++++++++++++++++ examples/alora/alora_server_testing.sh | 46 +++++++++++ examples/alora/new_alora_testing.py | 74 +++++++++++++++++ vllm/envs.py | 5 ++ vllm/forward_context.py | 9 +++ vllm/lora/layers.py | 47 +++++++++-- vllm/lora/request.py | 2 + vllm/model_executor/layers/linear.py | 7 ++ vllm/v1/core/kv_cache_utils.py | 16 ++++ vllm/v1/core/sched/scheduler.py | 14 ++++ vllm/v1/engine/processor.py | 20 +++++ vllm/v1/worker/gpu_model_runner.py | 105 ++++++++++++++++++++++++- 12 files changed, 401 insertions(+), 11 deletions(-) create mode 100644 examples/alora/alora_server_testing.py create mode 100644 examples/alora/alora_server_testing.sh create mode 100644 examples/alora/new_alora_testing.py diff --git a/examples/alora/alora_server_testing.py b/examples/alora/alora_server_testing.py new file mode 100644 index 000000000000..e9616600a54d --- /dev/null +++ b/examples/alora/alora_server_testing.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# After starting server using "vllm serve --enable_lora --lora_modules..." + +import time + +from openai import OpenAI + +model_id = "ibm-granite/granite-3.2-8b-instruct" + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +BASE_NAME = "ibm-granite/granite-3.2-8b-instruct" +ALORA_NAME = "new_alora" # "ibm-granite/granite-3.2-8b-alora-uncertainty" +invocation_string = "<|start_of_role|>certainty<|end_of_role|>" + +################################################################### +prompts = [ + "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>", + "What is MIT?", + ( + "<|start_of_role|>user<|end_of_role|>What is the capital of " + "Massachusetts?<|end_of_text|>\n" + ), + "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>", + ( + "<|start_of_role|>user<|end_of_role|>What is the capital of " + "Massachusetts?<|end_of_text|>\n" + ), + "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>", +] + +# Base model call +outputs_base = client.completions.create( + model=BASE_NAME, prompt=prompts, temperature=0, max_tokens=600 +) + +choices = outputs_base.choices +generated_text = [] +for i in range(len(prompts)): + prompt = prompts[i] + + generated_text += [outputs_base.choices[i].text] + print(f"Prompt: {prompt!r}, Generated text: {generated_text[-1]!r}") + +prompts_alora = [ + x + y + "<|end_of_text|>\n" + invocation_string + for x, y in zip(prompts, generated_text) +] + +# Base model with aLoRA call +t0 = time.time() +alora_outputs = client.completions.create( + model=ALORA_NAME, prompt=prompts_alora, temperature=0, max_tokens=10 +) +t = time.time() - t0 +print(f"Time: {t}") +for i in range(len(prompts_alora)): + prompt = prompts_alora[i] + generated_text = alora_outputs.choices[i].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/alora/alora_server_testing.sh b/examples/alora/alora_server_testing.sh new file mode 100644 index 000000000000..49eb9c5612f3 --- /dev/null +++ b/examples/alora/alora_server_testing.sh 
@@ -0,0 +1,46 @@ +#!/bin/bash + +# More documentation: https://docs.vllm.ai/en/v0.8.3/serving/openai_compatible_server.html#vllm-serve +export VLLM_USE_V1="1" +# Specify base model (and optionally loras) to load in when starting the server. +vllm serve ibm-granite/granite-3.2-8b-instruct \ + --enable-lora \ + --lora-modules '{"name": "new_alora", "path": "/proj/dmfexp/statllm/users/kgreenewald/.cache/huggingface/models/hub/models--ibm-granite--granite-3.2-8b-alora-uncertainty/snapshots/6109ad88201426003e696d023ec67c19e7f3d444", "base_model_name": "ibm-granite/granite-3.2-8b-instruct"}' \ + --dtype bfloat16 \ + --max-lora-rank 64 \ + --enable-prefix-caching +#--no-enable-prefix-caching +# Check that the lora model is listed along with other models. +#curl localhost:8000/v1/models | jq . + +########################################### + +# A second option is to enable dynamic adapter loading instead of at start-up. +#export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True + +#curl -X POST http://localhost:8000/v1/load_lora_adapter \ +#-H "Content-Type: application/json" \ +#-d '{ +# "lora_name": "new_alora", +# "lora_path": "/path/to/new_alora" +#}' +# Should return "200 OK - Success: LoRA adapter 'new_alora' added successfully" + +# Example of dynamically unloading an adapter. +# curl -X POST http://localhost:8000/v1/unload_lora_adapter \ +# -H "Content-Type: application/json" \ +# -d '{ +# "lora_name": "new_alora" +# }' + +########################################### + +# Send a request using the new aLoRA +#curl http://localhost:8000/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "new_alora", +# "prompt": ""What is MIT?"", +# "max_tokens": 600, +# "temperature": 0 +# }' | jq diff --git a/examples/alora/new_alora_testing.py b/examples/alora/new_alora_testing.py new file mode 100644 index 000000000000..5d3908ff8f24 --- /dev/null +++ b/examples/alora/new_alora_testing.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +import time + +import torch +from huggingface_hub import snapshot_download + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +BASE_NAME = "ibm-granite/granite-3.2-8b-instruct" +ALORA_NAME = "ibm-granite/granite-3.2-8b-alora-uncertainty" +invocation_string = "<|start_of_role|>certainty<|end_of_role|>" + +os.environ["VLLM_USE_V1"] = "1" +os.environ["VLLM_V1_USE_DEMO_LOGGING"] = "1" + +# download your LoRA adapter to ~/.cache/huggingface/… +alora_path = snapshot_download(repo_id=ALORA_NAME) + +print(alora_path) +####################################### + + +llm = LLM( + model=BASE_NAME, + enable_lora=True, + enforce_eager=True, + dtype=torch.bfloat16, + enable_prefix_caching=True, # enable APC + max_lora_rank=64, + enable_chunked_prefill=False, +) + +prompts = [ + ( + "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>" + ), +] + +sampling_params = SamplingParams(temperature=0, max_tokens=600) + +outputsBase = llm.generate( + prompts, + sampling_params, +) +generated_text = [] +for output in outputsBase: + prompt = output.prompt + generated_text += [output.outputs[0].text] + print(f"Prompt: {prompt!r}, Generated text: {generated_text[-1]!r}") + +prompts_alora = [ + x + y + "<|end_of_text|>\n" + invocation_string + for x, y in zip(prompts, generated_text) +] + +sampling_params = SamplingParams(temperature=0, max_tokens=10) + +t0 = time.time() +outputs = llm.generate( + 
prompts_alora, + sampling_params, + lora_request=LoRARequest("UQ_adapter", 1, alora_path), +) +t = time.time() - t0 +print(f"Time: {t}") + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm/envs.py b/vllm/envs.py index 921052821ee3..92d985c79fc3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -120,6 +120,7 @@ VLLM_USE_DEEP_GEMM: bool = False VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 + VLLM_V1_USE_DEMO_LOGGING: bool = True VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 @@ -835,6 +836,10 @@ def get_vllm_port() -> Optional[int]: "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")), + # Useful for demo + "VLLM_V1_USE_DEMO_LOGGING": + lambda: os.environ.get("VLLM_V1_USE_DEMO_LOGGING", "0") == "1", + # If set, allow insecure serialization using pickle. # This is useful for environments where it is deemed safe to use the # insecure method and it is needed for some reason. diff --git a/vllm/forward_context.py b/vllm/forward_context.py index dd55b19feeaf..144e3cc4a1d0 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -26,6 +26,12 @@ batchsize_forward_time: defaultdict = defaultdict(list) +@dataclass +class ALoRAMetadata: + k_offsets: torch.Tensor + query_start_locs: list[int] + + @dataclass class DPMetadata: max_tokens_across_dp_cpu: torch.Tensor @@ -94,6 +100,7 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None + alora_metadata: Optional[ALoRAMetadata] = None skip_cuda_graphs: bool = False @@ -116,6 +123,7 @@ def set_forward_context( num_tokens: Optional[int] = None, num_tokens_across_dp: Optional[torch.Tensor] = None, skip_cuda_graphs: bool = False, + alora_metadata: Optional[ALoRAMetadata] = None, ): """A context manager that stores the current forward context, can be attention metadata, etc. 
@@ -140,6 +148,7 @@ def set_forward_context( virtual_engine=virtual_engine, attn_metadata=attn_metadata, dp_metadata=dp_metadata, + alora_metadata=alora_metadata, skip_cuda_graphs=skip_cuda_graphs, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3d0c58317502..db8a881b0898 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -19,6 +19,7 @@ tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) from vllm.distributed.utils import divide +from vllm.forward_context import get_forward_context # yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, @@ -418,14 +419,44 @@ def apply(self, output = output.flatten(0, 1) x = x.flatten(0, 1) - lora_output: Optional[ - torch.Tensor] = self.punica_wrapper.add_lora_linear( - output, x, self.lora_a_stacked, self.lora_b_stacked, - self.lora_bias_stacked, 1.0, self.output_slices) - if not current_platform.can_update_inplace(): - output = lora_output - - return output + # Extract aLoRA batch metadata from forward context + alora_metadata = get_forward_context().alora_metadata + k_offsets = alora_metadata.k_offsets + query_start_locs = alora_metadata.query_start_locs + + # Build the 1D “save‐prefix” mask: + T = output.size(0) # total tokens + starts = query_start_locs[:-1] # starts and end index of each request + ends = query_start_locs[1:] + lengths = ends - starts # request lengths + kept_lens = lengths - k_offsets + kept_lens = torch.clamp( + kept_lens, + min=0) # portion of request to keep as base model weights + + device = output.device + # Create the alora mask + delta = torch.zeros(T + 1, device=device, dtype=output.dtype) + ends_for_scatter = starts + kept_lens + pos_vals = kept_lens.sign().to(output.dtype) + neg_vals = -pos_vals + delta.scatter_add_(0, starts, pos_vals) + delta.scatter_add_(0, ends_for_scatter, neg_vals) + cums = torch.cumsum(delta[:-1], dim=0) + mask1d = cums > 0 # shape [T], bool + mask2d = mask1d.unsqueeze(1).to(output.dtype) + + # Clone base layer output before running LoRA + orig_out = output.clone() + + # Apply LoRA in‐place on `output`: + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.lora_bias_stacked, 1.0, + self.output_slices) + # Apply alora mask + final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) + return final_output @property def weight(self) -> torch.Tensor: diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 5bbba7830c1b..a566e50d6050 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -33,6 +33,8 @@ class LoRARequest( long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) tensorizer_config_dict: Optional[dict] = None + invocation_tokens: Optional[list[int]] = None + k_offset: Optional[int] = None def __post_init__(self): if self.lora_local_path: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 588aa8deb183..f91e05fddcfa 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -9,6 +9,7 @@ import torch.nn as nn from torch.nn.parameter import Parameter, UninitializedParameter +from vllm.config import get_current_vllm_config from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -229,6 +230,12 @@ def __init__( ): super().__init__() + # tpa -- find out why this is needed + compilation_config = get_current_vllm_config().compilation_config + if prefix 
in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # Keep input parameters self.input_size = input_size self.output_size = output_size diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9489bcf433fd..7eff333a124b 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -457,6 +457,20 @@ def hash_request_tokens(hash_function: Any, block_size: int, token_ids = request.all_token_ids req_need_extra_keys = need_extra_keys(request) + if (request.lora_request is not None + and request.lora_request.invocation_tokens is not None): + use_alora = True + invocation_tokens = request.lora_request.invocation_tokens + # scan backward for the last match (faster than full forward scan+max) + invocation_start = -1 + n = len(invocation_tokens) + for idx in range(len(token_ids) - n, -1, -1): + if token_ids[idx:idx + n] == invocation_tokens: + # weights activated 1 token after start + invocation_start = idx + 1 + break + else: + use_alora = False req_extra_keys = None curr_mm_idx = 0 @@ -473,6 +487,8 @@ def hash_request_tokens(hash_function: Any, block_size: int, # MM and LoRA requests need extra keys for block-hash computation. req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys( request, start, end, curr_mm_idx) + if use_alora and end <= invocation_start: + req_extra_keys = None # cache is equivalent to base model cache block_hash = hash_block_tokens(hash_function, parent_block_hash_value, block_token_ids, req_extra_keys) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2d2274ab6a4d..4bb2bbd6e2c9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -8,6 +8,7 @@ from collections.abc import Iterable from typing import Any, Optional, Union +import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import ( @@ -211,6 +212,13 @@ def schedule(self) -> SchedulerOutput: num_new_tokens, self.max_model_len - request.num_computed_tokens) + if envs.VLLM_V1_USE_DEMO_LOGGING and num_new_tokens > 1: + logger.info("request_id: %s", request.request_id) + logger.info("num_tokens: %d", request.num_tokens) + logger.info("num_computed_tokens: %d", + request.num_computed_tokens) + logger.info("num_new_tokens: %d", num_new_tokens) + # Schedule encoder inputs. encoder_inputs_to_schedule = None new_encoder_budget = encoder_budget @@ -416,6 +424,12 @@ def schedule(self) -> SchedulerOutput: # The request cannot be scheduled. 
break + if envs.VLLM_V1_USE_DEMO_LOGGING: + logger.info("request_id: %s", request.request_id) + logger.info("num_tokens: %d", request.num_tokens) + logger.info("num_computed_tokens: %d", num_computed_tokens) + logger.info("num_new_tokens: %d", num_new_tokens) + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index e28879d40460..d6b0c28f16fc 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +import os import time from collections.abc import Mapping, Sequence from typing import Any, Literal, Optional, Union @@ -10,6 +12,7 @@ from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.multimodal.inputs import PlaceholderRange @@ -324,6 +327,23 @@ def process_inputs( else: sorted_mm_inputs = orig_sorted_mm_inputs + # Tokenize aLoRA invocation sequence if applicable. + if lora_request is not None: + + # Load in adapter config file + lora_path = get_adapter_absolute_path(lora_request.lora_path) + lora_config_path = os.path.join(lora_path, "adapter_config.json") + with open(lora_config_path) as f: + config = json.load(f) + + if "invocation_string" in config: # check if aLoRA + invocation_tokens = self.input_preprocessor._tokenize_prompt( + config["invocation_string"], + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs) + # Make it an aLoRA request + # (in future, this will happen upstream) + lora_request.invocation_tokens = invocation_tokens return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 558325fa0347..c0cdfdc9048a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -26,8 +26,8 @@ from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, graph_capture, prepare_communication_buffer_for_model) -from vllm.forward_context import (DPMetadata, get_forward_context, - set_forward_context) +from vllm.forward_context import (ALoRAMetadata, DPMetadata, + get_forward_context, set_forward_context) from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader @@ -746,6 +746,83 @@ def _prepare_inputs( return (attn_metadata, attention_cuda_graphs, logits_indices, spec_decode_metadata) + def _extract_offsets( + self, + scheduler_output: "SchedulerOutput", + ) -> ALoRAMetadata: + """ + Extract k_offsets for each new scheduled req that is called with aLoRA. + Prepare aLoRA metadata for model execution. 
+ """ + + for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + print(new_req_data.lora_request) + if (new_req_data.lora_request is not None and + new_req_data.lora_request.invocation_tokens is not None): + tokens = new_req_data.lora_request.invocation_tokens + prompt_ids = new_req_data.prompt_token_ids + n = len(tokens) + k_offset = -1 + # only bother if there actually are invocation tokens + if n > 0 and len(prompt_ids) >= n: + # scan backward for the last match + # (faster than full forward scan+max) + for idx in range(len(prompt_ids) - n, -1, -1): + if prompt_ids[idx:idx + n] == tokens: + # offset = number of tokens from the start + # of that match to the end of the prompt + k_offset = len(prompt_ids) - idx - 1 + break + if k_offset == -1: + raise ValueError( + "Invocation sequence not found in prompt " + f"for request '{req_id}'. aLoRA models require the " + "invocation tokens to be present in the input.") + + cached_lora_request = self.requests[req_id].lora_request + assert cached_lora_request is not None + cached_lora_request.k_offset = k_offset + + # Fill in k_offsets based on the `scheduled_new_reqs` and + # `scheduled_cached_reqs` within the SchedulerOutput. + num_seqs = len(self.query_start_loc_np.tolist()) - 1 + k_offsets = [1] * (num_seqs) + + for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + req_index = self.input_batch.req_id_to_index[req_id] + cached_lora_request = self.requests[req_id].lora_request + if (cached_lora_request is not None + and cached_lora_request.k_offset is not None): + k_offsets[req_index] = cached_lora_request.k_offset + else: + k_offsets[req_index] = len( + self.requests[req_id].prompt_token_ids) + + for cached_req_data in scheduler_output.scheduled_cached_reqs: + req_id = cached_req_data.req_id + req_index = self.input_batch.req_id_to_index[req_id] + cached_lora_request = self.requests[req_id].lora_request + if (cached_lora_request is not None + and cached_lora_request.k_offset is not None): + k_offsets[req_index] = cached_lora_request.k_offset + else: + k_offsets[req_index] = len( + self.requests[req_id].prompt_token_ids) + + query_locs = torch.tensor(self.query_start_loc_np.tolist(), + device=self.device) + + if len(query_locs) > self.input_batch.num_reqs + 1: + query_locs[self.input_batch.num_reqs + 1:] = 0 + + alora_metadata = ALoRAMetadata(k_offsets=torch.tensor( + k_offsets, device=self.device), + query_start_locs=query_locs) + + return alora_metadata + def _compute_cascade_attn_prefix_len( self, num_scheduled_tokens: np.ndarray, @@ -1209,6 +1286,11 @@ def execute_model( # Prepare the decoder inputs. (attn_metadata, attention_cuda_graphs, logits_indices, spec_decode_metadata) = (self._prepare_inputs(scheduler_output)) + + # tpa - let's do this in prepare input> + # Extract the aLoRA offsets if applicable. 
+ alora_metadata = self._extract_offsets(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -1286,6 +1368,7 @@ def execute_model( num_tokens=num_input_tokens, num_tokens_across_dp=num_tokens_across_dp, skip_cuda_graphs=skip_cuda_graphs, + alora_metadata=alora_metadata, ): self.maybe_setup_kv_connector(scheduler_output) @@ -1872,11 +1955,27 @@ def _dummy_run( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) + # Prepare dummy ALoRAMetadata + dummy_k_offsets = torch.tensor([1] * max_num_reqs, + device=self.device) + dummy_cu_num_tokens = np.cumsum(num_scheduled_tokens) + dummy_query_start_loc = [0] * (max_num_reqs + 1) + dummy_query_start_loc[0] = 0 + dummy_query_start_loc[1:num_reqs + 1] = dummy_cu_num_tokens + dummy_query_start_loc = torch.tensor(dummy_query_start_loc, + device=self.device) + dummy_alora_metadata = ALoRAMetadata( + k_offsets=dummy_k_offsets, + query_start_locs=dummy_query_start_loc, + ) + #num_reqs=num_reqs,) + with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, self.vllm_config, num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp): + num_tokens_across_dp=num_tokens_across_dp, + alora_metadata=dummy_alora_metadata): outputs = model( input_ids=input_ids, positions=positions, From a68e70b9a109e53bf7b20c0ecac96b04944dc194 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 17 Jun 2025 07:39:42 +0000 Subject: [PATCH 02/33] Fix type hint for query_start_locs Signed-off-by: Thomas Parnell --- vllm/forward_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 144e3cc4a1d0..cf8447a5a257 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -29,7 +29,7 @@ @dataclass class ALoRAMetadata: k_offsets: torch.Tensor - query_start_locs: list[int] + query_start_locs: torch.Tensor @dataclass From b254fb7ee12825b70d1e0a60c641e1dd96d65a70 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 04:33:37 +0000 Subject: [PATCH 03/33] vllm/model_executor/layers/linear.py: add comment on torch.compile Signed-off-by: Thomas Parnell --- vllm/model_executor/layers/linear.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f91e05fddcfa..ddf55bc49adc 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -230,7 +230,8 @@ def __init__( ): super().__init__() - # tpa -- find out why this is needed + # lets torch.compile know that forward_context needs to be + # considered as an input to the layer (copied from attention) compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") From 3897b1b630e5913dbcbc89571d42412c4a0d9bc2 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 04:34:36 +0000 Subject: [PATCH 04/33] vllm/v1/worker/gpu_model_runner.py: remove print statement Signed-off-by: Thomas Parnell --- vllm/v1/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c0cdfdc9048a..a9bd1edf3cc8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -757,7 +757,6 @@ def _extract_offsets( for 
new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id - print(new_req_data.lora_request) if (new_req_data.lora_request is not None and new_req_data.lora_request.invocation_tokens is not None): tokens = new_req_data.lora_request.invocation_tokens From 24ff3760b40860dbe1193642c8f2b20b3ed02161 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 04:36:13 +0000 Subject: [PATCH 05/33] vllm/v1/core/sched/scheduler.py: remove debug code Signed-off-by: Thomas Parnell --- vllm/v1/core/sched/scheduler.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4bb2bbd6e2c9..2d2274ab6a4d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -8,7 +8,6 @@ from collections.abc import Iterable from typing import Any, Optional, Union -import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import ( @@ -212,13 +211,6 @@ def schedule(self) -> SchedulerOutput: num_new_tokens, self.max_model_len - request.num_computed_tokens) - if envs.VLLM_V1_USE_DEMO_LOGGING and num_new_tokens > 1: - logger.info("request_id: %s", request.request_id) - logger.info("num_tokens: %d", request.num_tokens) - logger.info("num_computed_tokens: %d", - request.num_computed_tokens) - logger.info("num_new_tokens: %d", num_new_tokens) - # Schedule encoder inputs. encoder_inputs_to_schedule = None new_encoder_budget = encoder_budget @@ -424,12 +416,6 @@ def schedule(self) -> SchedulerOutput: # The request cannot be scheduled. break - if envs.VLLM_V1_USE_DEMO_LOGGING: - logger.info("request_id: %s", request.request_id) - logger.info("num_tokens: %d", request.num_tokens) - logger.info("num_computed_tokens: %d", num_computed_tokens) - logger.info("num_new_tokens: %d", num_new_tokens) - new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, From 412eacd5c9dfe805dfa3ef96e08c06363c20b385 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 04:37:01 +0000 Subject: [PATCH 06/33] vllm/envs.py Signed-off-by: Thomas Parnell --- vllm/envs.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 92d985c79fc3..921052821ee3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -120,7 +120,6 @@ VLLM_USE_DEEP_GEMM: bool = False VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 - VLLM_V1_USE_DEMO_LOGGING: bool = True VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 @@ -836,10 +835,6 @@ def get_vllm_port() -> Optional[int]: "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")), - # Useful for demo - "VLLM_V1_USE_DEMO_LOGGING": - lambda: os.environ.get("VLLM_V1_USE_DEMO_LOGGING", "0") == "1", - # If set, allow insecure serialization using pickle. # This is useful for environments where it is deemed safe to use the # insecure method and it is needed for some reason. 
From 32098e40787e846304ee0be75fcf9aca724c0b07 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 18:45:11 +0000 Subject: [PATCH 07/33] Inject aLoRA behaviour via mixin Signed-off-by: Thomas Parnell --- vllm/config.py | 2 + vllm/lora/layers.py | 98 ++++++++++++++++++++++++++++----------------- vllm/lora/utils.py | 11 ++++- 3 files changed, 73 insertions(+), 38 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d986ab6b0edb..6c1ad60f2abc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2873,6 +2873,8 @@ class LoRAConfig: allowed.""" bias_enabled: bool = False """Enable bias for LoRA adapters.""" + activated_lora_enabled: bool = True + """Enable Activated LoRA.""" def compute_hash(self) -> str: """ diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index db8a881b0898..775c12a28226 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -419,44 +419,14 @@ def apply(self, output = output.flatten(0, 1) x = x.flatten(0, 1) - # Extract aLoRA batch metadata from forward context - alora_metadata = get_forward_context().alora_metadata - k_offsets = alora_metadata.k_offsets - query_start_locs = alora_metadata.query_start_locs - - # Build the 1D “save‐prefix” mask: - T = output.size(0) # total tokens - starts = query_start_locs[:-1] # starts and end index of each request - ends = query_start_locs[1:] - lengths = ends - starts # request lengths - kept_lens = lengths - k_offsets - kept_lens = torch.clamp( - kept_lens, - min=0) # portion of request to keep as base model weights - - device = output.device - # Create the alora mask - delta = torch.zeros(T + 1, device=device, dtype=output.dtype) - ends_for_scatter = starts + kept_lens - pos_vals = kept_lens.sign().to(output.dtype) - neg_vals = -pos_vals - delta.scatter_add_(0, starts, pos_vals) - delta.scatter_add_(0, ends_for_scatter, neg_vals) - cums = torch.cumsum(delta[:-1], dim=0) - mask1d = cums > 0 # shape [T], bool - mask2d = mask1d.unsqueeze(1).to(output.dtype) + lora_output: Optional[ + torch.Tensor] = self.punica_wrapper.add_lora_linear( + output, x, self.lora_a_stacked, self.lora_b_stacked, + self.lora_bias_stacked, 1.0, self.output_slices) + if not current_platform.can_update_inplace(): + output = lora_output - # Clone base layer output before running LoRA - orig_out = output.clone() - - # Apply LoRA in‐place on `output`: - self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, - self.lora_b_stacked, - self.lora_bias_stacked, 1.0, - self.output_slices) - # Apply alora mask - final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) - return final_output + return output @property def weight(self) -> torch.Tensor: @@ -1314,3 +1284,57 @@ def can_replace_layer( def extra_repr(self) -> str: return self.base_layer.extra_repr() + + +class ActivatedLoRAMixin: + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + # In transformers backend, x and output have extra batch dimension like + # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), + # therefore we need to flatten the batch dimensions. 
+ if x.ndim == 3 and output.ndim == 3: + output = output.flatten(0, 1) + x = x.flatten(0, 1) + + # Extract aLoRA batch metadata from forward context + alora_metadata = get_forward_context().alora_metadata + k_offsets = alora_metadata.k_offsets + query_start_locs = alora_metadata.query_start_locs + + # Build the 1D “save‐prefix” mask: + T = output.size(0) # total tokens + starts = query_start_locs[:-1] # starts and end index of each request + ends = query_start_locs[1:] + lengths = ends - starts # request lengths + kept_lens = lengths - k_offsets + kept_lens = torch.clamp( + kept_lens, + min=0) # portion of request to keep as base model weights + + device = output.device + # Create the alora mask + delta = torch.zeros(T + 1, device=device, dtype=output.dtype) + ends_for_scatter = starts + kept_lens + pos_vals = kept_lens.sign().to(output.dtype) + neg_vals = -pos_vals + delta.scatter_add_(0, starts, pos_vals) + delta.scatter_add_(0, ends_for_scatter, neg_vals) + cums = torch.cumsum(delta[:-1], dim=0) + mask1d = cums > 0 # shape [T], bool + mask2d = mask1d.unsqueeze(1).to(output.dtype) + + # Clone base layer output before running LoRA + orig_out = output.clone() + + # Apply LoRA in‐place on `output`: + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.lora_bias_stacked, 1.0, + self.output_slices) + # Apply alora mask + final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) + return final_output diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index ee196e3f689a..6bafc6a00a59 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -21,7 +21,8 @@ # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable -from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, +from vllm.lora.layers import (ActivatedLoRAMixin, BaseLayerWithLoRA, + ColumnParallelLinearWithLoRA, LinearScalingRotaryEmbeddingWithLoRA, LogitsProcessorWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -67,6 +68,14 @@ def from_layer(layer: nn.Module, lora_config=lora_config, packed_modules_list=packed_modules_list, model_config=model_config): + + # inject a-LoRA behaviour + if (lora_config.activated_lora_enabled + and lora_cls is MergedQKVParallelLinearWithLoRA): + lora_cls = type( + lora_cls.__name__.replace("LoRA", "ActivatedLoRA"), + (ActivatedLoRAMixin, lora_cls), {}) + instance_layer = lora_cls(layer) instance_layer.create_lora_weights(max_loras, lora_config, model_config) From fb6d28ec829637aaf816e774353bf8513d9e3891 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 19:16:13 +0000 Subject: [PATCH 08/33] Simpler implementation without mixin Signed-off-by: Thomas Parnell --- vllm/lora/layers.py | 19 +++++++++++++++++-- vllm/lora/utils.py | 13 +++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 775c12a28226..5d653a5ac4e6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -875,7 +875,8 @@ def can_replace_layer( model_config: Optional[PretrainedConfig], ) -> bool: return (type(source_layer) is QKVParallelLinear - and len(packed_modules_list) == 3) + and len(packed_modules_list) == 3 + and not lora_config.activated_lora_enabled) #TODO: Implement this @@ -1286,7 +1287,8 @@ def extra_repr(self) -> str: return self.base_layer.extra_repr() -class ActivatedLoRAMixin: +class MergedQKVParallelLinearWithActivatedLoRA(MergedQKVParallelLinearWithLoRA + ): def apply(self, x: torch.Tensor, @@ -1338,3 +1340,16 @@ def 
apply(self, # Apply alora mask final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) return final_output + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + """Returns True if the layer can be replaced by this LoRA layer.""" + return (type(source_layer) is QKVParallelLinear + and len(packed_modules_list) == 3 + and lora_config.activated_lora_enabled) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 6bafc6a00a59..f5ad741fdccc 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -21,11 +21,11 @@ # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable -from vllm.lora.layers import (ActivatedLoRAMixin, BaseLayerWithLoRA, - ColumnParallelLinearWithLoRA, +from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, LinearScalingRotaryEmbeddingWithLoRA, LogitsProcessorWithLoRA, MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithActivatedLoRA, MergedQKVParallelLinearWithLoRA, QKVParallelLinearWithLoRA, ReplicatedLinearWithLoRA, @@ -45,6 +45,7 @@ MergedColumnParallelLinearWithLoRA, QKVParallelLinearWithLoRA, MergedQKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithActivatedLoRA, RowParallelLinearWithLoRA, ReplicatedLinearWithLoRA, LogitsProcessorWithLoRA, @@ -68,14 +69,6 @@ def from_layer(layer: nn.Module, lora_config=lora_config, packed_modules_list=packed_modules_list, model_config=model_config): - - # inject a-LoRA behaviour - if (lora_config.activated_lora_enabled - and lora_cls is MergedQKVParallelLinearWithLoRA): - lora_cls = type( - lora_cls.__name__.replace("LoRA", "ActivatedLoRA"), - (ActivatedLoRAMixin, lora_cls), {}) - instance_layer = lora_cls(layer) instance_layer.create_lora_weights(max_loras, lora_config, model_config) From 5f62d8beeb439b7f4ec37c126653ca829c2482fb Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 22:15:01 +0000 Subject: [PATCH 09/33] Scan for invocation tokens in one place Signed-off-by: Thomas Parnell --- vllm/forward_context.py | 2 +- vllm/lora/layers.py | 6 +- vllm/lora/request.py | 2 +- vllm/v1/core/kv_cache_utils.py | 22 ++--- vllm/v1/engine/processor.py | 30 ++++++- vllm/v1/worker/gpu_model_runner.py | 129 ++++++++--------------------- 6 files changed, 70 insertions(+), 121 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index cf8447a5a257..54248f89b385 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -29,7 +29,7 @@ @dataclass class ALoRAMetadata: k_offsets: torch.Tensor - query_start_locs: torch.Tensor + query_start_loc: torch.Tensor @dataclass diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5d653a5ac4e6..da25de8317e9 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1305,12 +1305,12 @@ def apply(self, # Extract aLoRA batch metadata from forward context alora_metadata = get_forward_context().alora_metadata k_offsets = alora_metadata.k_offsets - query_start_locs = alora_metadata.query_start_locs + query_start_loc = alora_metadata.query_start_loc # Build the 1D “save‐prefix” mask: T = output.size(0) # total tokens - starts = query_start_locs[:-1] # starts and end index of each request - ends = query_start_locs[1:] + starts = query_start_loc[:-1] # starts and end index of each request + ends = query_start_loc[1:] lengths = ends - starts # request lengths kept_lens = lengths - k_offsets kept_lens = torch.clamp( diff --git 
a/vllm/lora/request.py b/vllm/lora/request.py index a566e50d6050..c5851af8c21b 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -33,7 +33,7 @@ class LoRARequest( long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) tensorizer_config_dict: Optional[dict] = None - invocation_tokens: Optional[list[int]] = None + invocation_start: Optional[int] = None k_offset: Optional[int] = None def __post_init__(self): diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 7eff333a124b..b5a0af9cb762 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -457,20 +457,6 @@ def hash_request_tokens(hash_function: Any, block_size: int, token_ids = request.all_token_ids req_need_extra_keys = need_extra_keys(request) - if (request.lora_request is not None - and request.lora_request.invocation_tokens is not None): - use_alora = True - invocation_tokens = request.lora_request.invocation_tokens - # scan backward for the last match (faster than full forward scan+max) - invocation_start = -1 - n = len(invocation_tokens) - for idx in range(len(token_ids) - n, -1, -1): - if token_ids[idx:idx + n] == invocation_tokens: - # weights activated 1 token after start - invocation_start = idx + 1 - break - else: - use_alora = False req_extra_keys = None curr_mm_idx = 0 @@ -487,8 +473,12 @@ def hash_request_tokens(hash_function: Any, block_size: int, # MM and LoRA requests need extra keys for block-hash computation. req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys( request, start, end, curr_mm_idx) - if use_alora and end <= invocation_start: - req_extra_keys = None # cache is equivalent to base model cache + # Respect a-LoRA behaviour + if (request.lora_request is not None + and request.lora_request.invocation_start is not None + and end <= request.lora_request.invocation_start): + # cache is equivalent to base model cache + req_extra_keys = None block_hash = hash_block_tokens(hash_function, parent_block_hash_value, block_token_ids, req_extra_keys) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index d6b0c28f16fc..6861525b856c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -330,20 +330,42 @@ def process_inputs( # Tokenize aLoRA invocation sequence if applicable. if lora_request is not None: + # tpa: can we get this from PeftHelper somehow? # Load in adapter config file lora_path = get_adapter_absolute_path(lora_request.lora_path) lora_config_path = os.path.join(lora_path, "adapter_config.json") with open(lora_config_path) as f: config = json.load(f) - if "invocation_string" in config: # check if aLoRA + if "invocation_string" in config: + invocation_tokens = self.input_preprocessor._tokenize_prompt( config["invocation_string"], lora_request=lora_request, tokenization_kwargs=tokenization_kwargs) - # Make it an aLoRA request - # (in future, this will happen upstream) - lora_request.invocation_tokens = invocation_tokens + + invocation_start = -1 + n = len(invocation_tokens) + token_ids = decoder_inputs["prompt_token_ids"] + + if n > 0 and len(token_ids) >= n: + # scan backward for the last match + # (faster than full forward scan+max) + for idx in range(len(token_ids) - n, -1, -1): + if token_ids[idx:idx + n] == invocation_tokens: + # weights activated 1 token after start + invocation_start = idx + 1 + break + + if invocation_start == -1: + raise ValueError( + "Invocation sequence not found in prompt " + f"for request '{request_id}'. 
aLoRA models require the " + "invocation tokens to be present in the input.") + + lora_request.invocation_start = invocation_start + lora_request.k_offset = len(token_ids) - invocation_start + return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a9bd1edf3cc8..83fc2f7d3867 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -557,7 +557,7 @@ def _prepare_inputs( self, scheduler_output: "SchedulerOutput", ) -> tuple[dict[str, Any], bool, torch.Tensor, - Optional[SpecDecodeMetadata]]: + Optional[SpecDecodeMetadata], Optional[ALoRAMetadata]]: """ :return: tuple[ attn_metadata: layer-to-attention_metadata mapping, @@ -743,84 +743,28 @@ def _prepare_inputs( if self.lora_config: self.set_active_loras(self.input_batch, num_scheduled_tokens) - return (attn_metadata, attention_cuda_graphs, logits_indices, - spec_decode_metadata) - - def _extract_offsets( - self, - scheduler_output: "SchedulerOutput", - ) -> ALoRAMetadata: - """ - Extract k_offsets for each new scheduled req that is called with aLoRA. - Prepare aLoRA metadata for model execution. - """ - - for new_req_data in scheduler_output.scheduled_new_reqs: - req_id = new_req_data.req_id - if (new_req_data.lora_request is not None and - new_req_data.lora_request.invocation_tokens is not None): - tokens = new_req_data.lora_request.invocation_tokens - prompt_ids = new_req_data.prompt_token_ids - n = len(tokens) - k_offset = -1 - # only bother if there actually are invocation tokens - if n > 0 and len(prompt_ids) >= n: - # scan backward for the last match - # (faster than full forward scan+max) - for idx in range(len(prompt_ids) - n, -1, -1): - if prompt_ids[idx:idx + n] == tokens: - # offset = number of tokens from the start - # of that match to the end of the prompt - k_offset = len(prompt_ids) - idx - 1 - break - if k_offset == -1: - raise ValueError( - "Invocation sequence not found in prompt " - f"for request '{req_id}'. aLoRA models require the " - "invocation tokens to be present in the input.") - + # Compute a-LoRA metadata + if self.lora_config.activated_lora_enabled: + k_offsets = [1] * (num_reqs) + for req_id in self.input_batch.req_ids: + req_index = self.input_batch.req_id_to_index[req_id] cached_lora_request = self.requests[req_id].lora_request - assert cached_lora_request is not None - cached_lora_request.k_offset = k_offset - - # Fill in k_offsets based on the `scheduled_new_reqs` and - # `scheduled_cached_reqs` within the SchedulerOutput. 
- num_seqs = len(self.query_start_loc_np.tolist()) - 1 - k_offsets = [1] * (num_seqs) - - for new_req_data in scheduler_output.scheduled_new_reqs: - req_id = new_req_data.req_id - req_index = self.input_batch.req_id_to_index[req_id] - cached_lora_request = self.requests[req_id].lora_request - if (cached_lora_request is not None - and cached_lora_request.k_offset is not None): - k_offsets[req_index] = cached_lora_request.k_offset - else: - k_offsets[req_index] = len( - self.requests[req_id].prompt_token_ids) - - for cached_req_data in scheduler_output.scheduled_cached_reqs: - req_id = cached_req_data.req_id - req_index = self.input_batch.req_id_to_index[req_id] - cached_lora_request = self.requests[req_id].lora_request - if (cached_lora_request is not None - and cached_lora_request.k_offset is not None): - k_offsets[req_index] = cached_lora_request.k_offset - else: - k_offsets[req_index] = len( - self.requests[req_id].prompt_token_ids) - - query_locs = torch.tensor(self.query_start_loc_np.tolist(), - device=self.device) - - if len(query_locs) > self.input_batch.num_reqs + 1: - query_locs[self.input_batch.num_reqs + 1:] = 0 + if (cached_lora_request is not None + and cached_lora_request.k_offset is not None): + k_offsets[req_index] = cached_lora_request.k_offset + else: + k_offsets[req_index] = len( + self.requests[req_id].prompt_token_ids) - alora_metadata = ALoRAMetadata(k_offsets=torch.tensor( - k_offsets, device=self.device), - query_start_locs=query_locs) + alora_metadata = ALoRAMetadata( + k_offsets=torch.tensor(k_offsets, device=self.device), + query_start_loc=query_start_loc.to(torch.int64), + ) + else: + alora_metadata = None - return alora_metadata + return (attn_metadata, attention_cuda_graphs, logits_indices, + spec_decode_metadata, alora_metadata) def _compute_cascade_attn_prefix_len( self, @@ -1284,11 +1228,8 @@ def execute_model( # Prepare the decoder inputs. (attn_metadata, attention_cuda_graphs, logits_indices, - spec_decode_metadata) = (self._prepare_inputs(scheduler_output)) - - # tpa - let's do this in prepare input> - # Extract the aLoRA offsets if applicable. 
- alora_metadata = self._extract_offsets(scheduler_output) + spec_decode_metadata, + alora_metadata) = (self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph @@ -1954,27 +1895,23 @@ def _dummy_run( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) - # Prepare dummy ALoRAMetadata - dummy_k_offsets = torch.tensor([1] * max_num_reqs, - device=self.device) - dummy_cu_num_tokens = np.cumsum(num_scheduled_tokens) - dummy_query_start_loc = [0] * (max_num_reqs + 1) - dummy_query_start_loc[0] = 0 - dummy_query_start_loc[1:num_reqs + 1] = dummy_cu_num_tokens - dummy_query_start_loc = torch.tensor(dummy_query_start_loc, - device=self.device) - dummy_alora_metadata = ALoRAMetadata( - k_offsets=dummy_k_offsets, - query_start_locs=dummy_query_start_loc, - ) - #num_reqs=num_reqs,) + if self.lora_config.activated_lora_enabled: + k_offsets = torch.tensor([1] * num_reqs, device=self.device) + query_start_loc = self.query_start_loc[:num_reqs + 1].to( + torch.int64) + alora_metadata = ALoRAMetadata( + k_offsets=k_offsets, + query_start_loc=query_start_loc, + ) + else: + alora_metadata = None with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, self.vllm_config, num_tokens=num_tokens, num_tokens_across_dp=num_tokens_across_dp, - alora_metadata=dummy_alora_metadata): + alora_metadata=alora_metadata): outputs = model( input_ids=input_ids, positions=positions, From f9396b0d01d0f9275fdd90e17075de05d59da9f1 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 18 Jun 2025 22:23:35 +0000 Subject: [PATCH 10/33] Just use single field in request Signed-off-by: Thomas Parnell --- vllm/lora/request.py | 1 - vllm/v1/core/kv_cache_utils.py | 4 ++-- vllm/v1/engine/processor.py | 9 ++++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index c5851af8c21b..d0f39f852190 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -33,7 +33,6 @@ class LoRARequest( long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) tensorizer_config_dict: Optional[dict] = None - invocation_start: Optional[int] = None k_offset: Optional[int] = None def __post_init__(self): diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index b5a0af9cb762..6dbbe208b785 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -475,8 +475,8 @@ def hash_request_tokens(hash_function: Any, block_size: int, request, start, end, curr_mm_idx) # Respect a-LoRA behaviour if (request.lora_request is not None - and request.lora_request.invocation_start is not None - and end <= request.lora_request.invocation_start): + and request.lora_request.k_offset is not None and end + <= (len(token_ids) - request.lora_request.k_offset)): # cache is equivalent to base model cache req_extra_keys = None diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6861525b856c..58abf1e1e6b3 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -344,7 +344,7 @@ def process_inputs( lora_request=lora_request, tokenization_kwargs=tokenization_kwargs) - invocation_start = -1 + k_offset = -1 n = len(invocation_tokens) token_ids = decoder_inputs["prompt_token_ids"] @@ -354,17 +354,16 @@ def process_inputs( for idx in range(len(token_ids) - n, -1, -1): if token_ids[idx:idx + n] == invocation_tokens: # weights activated 1 token after start - 
invocation_start = idx + 1 + k_offset = len(token_ids) - idx - 1 break - if invocation_start == -1: + if k_offset == -1: raise ValueError( "Invocation sequence not found in prompt " f"for request '{request_id}'. aLoRA models require the " "invocation tokens to be present in the input.") - lora_request.invocation_start = invocation_start - lora_request.k_offset = len(token_ids) - invocation_start + lora_request.k_offset = k_offset return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, From 6f36f6d0f93c670bfe4605c9e89a3b6d99a8883f Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 04:08:40 +0000 Subject: [PATCH 11/33] Use peft_helper instead of reading files directly Signed-off-by: Thomas Parnell --- vllm/lora/peft_helper.py | 2 ++ vllm/v1/engine/processor.py | 21 +++++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index a20d73f0f725..edd7be16fdff 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -37,6 +37,8 @@ class PEFTHelper: use_dora: bool = field(default=False) # long context lora field context_length: int = field(default=0) + # Invocation string for Activated LoRA (aLoRA, see: https://arxiv.org/abs/2504.12397) + invocation_string: Optional[str] = field(default=None) # Extra vllm field, start with 'vllm_' to avoid conflict vllm_lora_scaling_factor: float = field(default=1.0) vllm_max_position_embeddings: Optional[int] = field(default=False) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 58abf1e1e6b3..0a6de2c3b08f 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import json -import os import time from collections.abc import Mapping, Sequence from typing import Any, Literal, Optional, Union @@ -11,8 +9,8 @@ from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.multimodal.inputs import PlaceholderRange @@ -328,19 +326,18 @@ def process_inputs( sorted_mm_inputs = orig_sorted_mm_inputs # Tokenize aLoRA invocation sequence if applicable. - if lora_request is not None: + if self.lora_config.activated_lora_enabled and lora_request is not None: - # tpa: can we get this from PeftHelper somehow? 
- # Load in adapter config file - lora_path = get_adapter_absolute_path(lora_request.lora_path) - lora_config_path = os.path.join(lora_path, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) + text_config = self.model_config.hf_config.get_text_config() - if "invocation_string" in config: + peft_helper = PEFTHelper.from_local_dir( + lora_request.lora_path, text_config.max_position_embeddings, + lora_request.tensorizer_config_dict) + + if peft_helper.invocation_string is not None: invocation_tokens = self.input_preprocessor._tokenize_prompt( - config["invocation_string"], + peft_helper.invocation_string, lora_request=lora_request, tokenization_kwargs=tokenization_kwargs) From c6ffe8f8aca86faafae720a8e5b2f994d577f3e0 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 04:12:10 +0000 Subject: [PATCH 12/33] Remove online example for now. Signed-off-by: Thomas Parnell --- ...ra_testing.py => alora_offline_example.py} | 0 examples/alora/alora_server_testing.py | 67 ------------------- examples/alora/alora_server_testing.sh | 46 ------------- 3 files changed, 113 deletions(-) rename examples/alora/{new_alora_testing.py => alora_offline_example.py} (100%) delete mode 100644 examples/alora/alora_server_testing.py delete mode 100644 examples/alora/alora_server_testing.sh diff --git a/examples/alora/new_alora_testing.py b/examples/alora/alora_offline_example.py similarity index 100% rename from examples/alora/new_alora_testing.py rename to examples/alora/alora_offline_example.py diff --git a/examples/alora/alora_server_testing.py b/examples/alora/alora_server_testing.py deleted file mode 100644 index e9616600a54d..000000000000 --- a/examples/alora/alora_server_testing.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# After starting server using "vllm serve --enable_lora --lora_modules..." - -import time - -from openai import OpenAI - -model_id = "ibm-granite/granite-3.2-8b-instruct" - -# Modify OpenAI's API key and API base to use vLLM's API server. 
-openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -BASE_NAME = "ibm-granite/granite-3.2-8b-instruct" -ALORA_NAME = "new_alora" # "ibm-granite/granite-3.2-8b-alora-uncertainty" -invocation_string = "<|start_of_role|>certainty<|end_of_role|>" - -################################################################### -prompts = [ - "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>", - "What is MIT?", - ( - "<|start_of_role|>user<|end_of_role|>What is the capital of " - "Massachusetts?<|end_of_text|>\n" - ), - "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>", - ( - "<|start_of_role|>user<|end_of_role|>What is the capital of " - "Massachusetts?<|end_of_text|>\n" - ), - "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>", -] - -# Base model call -outputs_base = client.completions.create( - model=BASE_NAME, prompt=prompts, temperature=0, max_tokens=600 -) - -choices = outputs_base.choices -generated_text = [] -for i in range(len(prompts)): - prompt = prompts[i] - - generated_text += [outputs_base.choices[i].text] - print(f"Prompt: {prompt!r}, Generated text: {generated_text[-1]!r}") - -prompts_alora = [ - x + y + "<|end_of_text|>\n" + invocation_string - for x, y in zip(prompts, generated_text) -] - -# Base model with aLoRA call -t0 = time.time() -alora_outputs = client.completions.create( - model=ALORA_NAME, prompt=prompts_alora, temperature=0, max_tokens=10 -) -t = time.time() - t0 -print(f"Time: {t}") -for i in range(len(prompts_alora)): - prompt = prompts_alora[i] - generated_text = alora_outputs.choices[i].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/alora/alora_server_testing.sh b/examples/alora/alora_server_testing.sh deleted file mode 100644 index 49eb9c5612f3..000000000000 --- a/examples/alora/alora_server_testing.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# More documentation: https://docs.vllm.ai/en/v0.8.3/serving/openai_compatible_server.html#vllm-serve -export VLLM_USE_V1="1" -# Specify base model (and optionally loras) to load in when starting the server. -vllm serve ibm-granite/granite-3.2-8b-instruct \ - --enable-lora \ - --lora-modules '{"name": "new_alora", "path": "/proj/dmfexp/statllm/users/kgreenewald/.cache/huggingface/models/hub/models--ibm-granite--granite-3.2-8b-alora-uncertainty/snapshots/6109ad88201426003e696d023ec67c19e7f3d444", "base_model_name": "ibm-granite/granite-3.2-8b-instruct"}' \ - --dtype bfloat16 \ - --max-lora-rank 64 \ - --enable-prefix-caching -#--no-enable-prefix-caching -# Check that the lora model is listed along with other models. -#curl localhost:8000/v1/models | jq . - -########################################### - -# A second option is to enable dynamic adapter loading instead of at start-up. -#export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True - -#curl -X POST http://localhost:8000/v1/load_lora_adapter \ -#-H "Content-Type: application/json" \ -#-d '{ -# "lora_name": "new_alora", -# "lora_path": "/path/to/new_alora" -#}' -# Should return "200 OK - Success: LoRA adapter 'new_alora' added successfully" - -# Example of dynamically unloading an adapter. 
-# curl -X POST http://localhost:8000/v1/unload_lora_adapter \ -# -H "Content-Type: application/json" \ -# -d '{ -# "lora_name": "new_alora" -# }' - -########################################### - -# Send a request using the new aLoRA -#curl http://localhost:8000/v1/completions \ -# -H "Content-Type: application/json" \ -# -d '{ -# "model": "new_alora", -# "prompt": ""What is MIT?"", -# "max_tokens": 600, -# "temperature": 0 -# }' | jq From 4a4b568ccbaf67961883ec0900c7d36ead99b3aa Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 09:43:19 +0000 Subject: [PATCH 13/33] Further simplification; works with chunked prefill; correct output with torch.compile Signed-off-by: Thomas Parnell --- vllm/forward_context.py | 3 +-- vllm/lora/layers.py | 25 ++--------------- vllm/lora/request.py | 2 +- vllm/model_executor/layers/linear.py | 14 +++++----- vllm/v1/core/kv_cache_utils.py | 4 +-- vllm/v1/engine/processor.py | 8 +++--- vllm/v1/worker/gpu_model_runner.py | 40 +++++++++++++++------------- 7 files changed, 40 insertions(+), 56 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 54248f89b385..a9870dae6e60 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -28,8 +28,7 @@ @dataclass class ALoRAMetadata: - k_offsets: torch.Tensor - query_start_loc: torch.Tensor + mask1d: torch.Tensor @dataclass diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index da25de8317e9..145d3bbcc5b8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1304,29 +1304,8 @@ def apply(self, # Extract aLoRA batch metadata from forward context alora_metadata = get_forward_context().alora_metadata - k_offsets = alora_metadata.k_offsets - query_start_loc = alora_metadata.query_start_loc - - # Build the 1D “save‐prefix” mask: - T = output.size(0) # total tokens - starts = query_start_loc[:-1] # starts and end index of each request - ends = query_start_loc[1:] - lengths = ends - starts # request lengths - kept_lens = lengths - k_offsets - kept_lens = torch.clamp( - kept_lens, - min=0) # portion of request to keep as base model weights - - device = output.device - # Create the alora mask - delta = torch.zeros(T + 1, device=device, dtype=output.dtype) - ends_for_scatter = starts + kept_lens - pos_vals = kept_lens.sign().to(output.dtype) - neg_vals = -pos_vals - delta.scatter_add_(0, starts, pos_vals) - delta.scatter_add_(0, ends_for_scatter, neg_vals) - cums = torch.cumsum(delta[:-1], dim=0) - mask1d = cums > 0 # shape [T], bool + + mask1d = alora_metadata.mask1d mask2d = mask1d.unsqueeze(1).to(output.dtype) # Clone base layer output before running LoRA diff --git a/vllm/lora/request.py b/vllm/lora/request.py index d0f39f852190..64762f15ec2e 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -33,7 +33,7 @@ class LoRARequest( long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) tensorizer_config_dict: Optional[dict] = None - k_offset: Optional[int] = None + invocation_start: Optional[int] = None def __post_init__(self): if self.lora_local_path: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index ddf55bc49adc..9d506b87539d 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -230,12 +230,14 @@ def __init__( ): super().__init__() - # lets torch.compile know that forward_context needs to be - # considered as an input to the layer (copied from attention) - compilation_config = 
get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self + vllm_config = get_current_vllm_config() + if vllm_config.lora_config.activated_lora_enabled: + # lets torch.compile know that forward_context needs to be + # considered as an input to the layer (copied from attention) + compilation_config = vllm_config.compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self # Keep input parameters self.input_size = input_size diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6dbbe208b785..b5a0af9cb762 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -475,8 +475,8 @@ def hash_request_tokens(hash_function: Any, block_size: int, request, start, end, curr_mm_idx) # Respect a-LoRA behaviour if (request.lora_request is not None - and request.lora_request.k_offset is not None and end - <= (len(token_ids) - request.lora_request.k_offset)): + and request.lora_request.invocation_start is not None + and end <= request.lora_request.invocation_start): # cache is equivalent to base model cache req_extra_keys = None diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0a6de2c3b08f..40ac57951c27 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -341,7 +341,7 @@ def process_inputs( lora_request=lora_request, tokenization_kwargs=tokenization_kwargs) - k_offset = -1 + invocation_start = -1 n = len(invocation_tokens) token_ids = decoder_inputs["prompt_token_ids"] @@ -351,16 +351,16 @@ def process_inputs( for idx in range(len(token_ids) - n, -1, -1): if token_ids[idx:idx + n] == invocation_tokens: # weights activated 1 token after start - k_offset = len(token_ids) - idx - 1 + invocation_start = idx + 1 break - if k_offset == -1: + if invocation_start == -1: raise ValueError( "Invocation sequence not found in prompt " f"for request '{request_id}'. aLoRA models require the " "invocation tokens to be present in the input.") - lora_request.k_offset = k_offset + lora_request.invocation_start = invocation_start return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 83fc2f7d3867..1bed6d3c680c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -229,6 +229,11 @@ def __init__( dtype=torch.int64, device=self.device) + if self.lora_config.activated_lora_enabled: + self.mask1d = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=self.device) + # None in the first PP rank. The rest are set after load_model. 
self.intermediate_tensors: Optional[IntermediateTensors] = None @@ -745,21 +750,24 @@ def _prepare_inputs( # Compute a-LoRA metadata if self.lora_config.activated_lora_enabled: - k_offsets = [1] * (num_reqs) + invocation_start = np.empty(shape=(num_reqs, ), dtype=int) for req_id in self.input_batch.req_ids: req_index = self.input_batch.req_id_to_index[req_id] cached_lora_request = self.requests[req_id].lora_request if (cached_lora_request is not None - and cached_lora_request.k_offset is not None): - k_offsets[req_index] = cached_lora_request.k_offset + and cached_lora_request.invocation_start is not None): + invocation_start[ + req_index] = cached_lora_request.invocation_start else: - k_offsets[req_index] = len( + invocation_start[req_index] = len( self.requests[req_id].prompt_token_ids) - - alora_metadata = ALoRAMetadata( - k_offsets=torch.tensor(k_offsets, device=self.device), - query_start_loc=query_start_loc.to(torch.int64), - ) + mask1d_cpu = torch.tensor(positions_np + < invocation_start[req_indices], + dtype=torch.bool, + device="cpu") + mask1d = self.mask1d[:total_num_scheduled_tokens] + mask1d.copy_(mask1d_cpu, non_blocking=True) + alora_metadata = ALoRAMetadata(mask1d=mask1d) else: alora_metadata = None @@ -1895,16 +1903,12 @@ def _dummy_run( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) + alora_metadata = None if self.lora_config.activated_lora_enabled: - k_offsets = torch.tensor([1] * num_reqs, device=self.device) - query_start_loc = self.query_start_loc[:num_reqs + 1].to( - torch.int64) - alora_metadata = ALoRAMetadata( - k_offsets=k_offsets, - query_start_loc=query_start_loc, - ) - else: - alora_metadata = None + mask1d = self.mask1d[:num_tokens] + alora_metadata = ALoRAMetadata(mask1d=mask1d) + # needed to avoid guard failures + torch._dynamo.mark_dynamic(alora_metadata.mask1d, 0) with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, From 4cbef84038c4406db6b4d40cd70a477fb358484f Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 14:10:26 +0000 Subject: [PATCH 14/33] Add enable_activated_lora engine arg Signed-off-by: Thomas Parnell --- examples/alora/alora_offline_example.py | 6 ++---- vllm/config.py | 2 +- vllm/engine/arg_utils.py | 4 ++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/alora/alora_offline_example.py b/examples/alora/alora_offline_example.py index 5d3908ff8f24..c3b049ec5871 100644 --- a/examples/alora/alora_offline_example.py +++ b/examples/alora/alora_offline_example.py @@ -10,11 +10,11 @@ from vllm.lora.request import LoRARequest BASE_NAME = "ibm-granite/granite-3.2-8b-instruct" + ALORA_NAME = "ibm-granite/granite-3.2-8b-alora-uncertainty" invocation_string = "<|start_of_role|>certainty<|end_of_role|>" os.environ["VLLM_USE_V1"] = "1" -os.environ["VLLM_V1_USE_DEMO_LOGGING"] = "1" # download your LoRA adapter to ~/.cache/huggingface/… alora_path = snapshot_download(repo_id=ALORA_NAME) @@ -26,11 +26,9 @@ llm = LLM( model=BASE_NAME, enable_lora=True, - enforce_eager=True, + enable_activated_lora=True, dtype=torch.bfloat16, - enable_prefix_caching=True, # enable APC max_lora_rank=64, - enable_chunked_prefill=False, ) prompts = [ diff --git a/vllm/config.py b/vllm/config.py index 6c1ad60f2abc..fdbb6364301d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2873,7 +2873,7 @@ class LoRAConfig: allowed.""" bias_enabled: bool = False """Enable bias for LoRA adapters.""" - activated_lora_enabled: bool = True + activated_lora_enabled: bool = False 
"""Enable Activated LoRA.""" def compute_hash(self) -> str: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f599d7a3bb5e..e1751ccb8efb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -355,6 +355,7 @@ class EngineArgs: # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled + enable_activated_lora: bool = LoRAConfig.activated_lora_enabled max_loras: int = LoRAConfig.max_loras max_lora_rank: int = LoRAConfig.max_lora_rank fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras @@ -733,6 +734,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="If True, enable handling of LoRA adapters.") lora_group.add_argument("--enable-lora-bias", **lora_kwargs["bias_enabled"]) + lora_group.add_argument("--enable-activated-lora", + **lora_kwargs["activated_lora"]) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) @@ -1190,6 +1193,7 @@ def create_engine_config( lora_config = LoRAConfig( bias_enabled=self.enable_lora_bias, + activated_lora_enabled=self.enable_activated_lora, max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, fully_sharded_loras=self.fully_sharded_loras, From 49a5bdc9e617e63f52e62d563027c80f1961095c Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 14:19:48 +0000 Subject: [PATCH 15/33] Disable tqdm in example Signed-off-by: Thomas Parnell --- examples/alora/alora_offline_example.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/alora/alora_offline_example.py b/examples/alora/alora_offline_example.py index c3b049ec5871..4133ac341d23 100644 --- a/examples/alora/alora_offline_example.py +++ b/examples/alora/alora_offline_example.py @@ -43,6 +43,7 @@ outputsBase = llm.generate( prompts, sampling_params, + use_tqdm=False, ) generated_text = [] for output in outputsBase: @@ -62,6 +63,7 @@ prompts_alora, sampling_params, lora_request=LoRARequest("UQ_adapter", 1, alora_path), + use_tqdm=False, ) t = time.time() - t0 print(f"Time: {t}") From 5c2e1815900b1d3075ed95535c0be66510575ad2 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 14:46:09 +0000 Subject: [PATCH 16/33] Trigger Build Signed-off-by: Thomas Parnell From ceae7c7f45040a602fa919471d9d9b752995a652 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 17:04:29 +0000 Subject: [PATCH 17/33] vllm/model_executor/layers/linear.py: check lora_config exists before checking activated lora flag Signed-off-by: Thomas Parnell --- vllm/model_executor/layers/linear.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 9d506b87539d..0648e8e6a199 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -231,7 +231,8 @@ def __init__( super().__init__() vllm_config = get_current_vllm_config() - if vllm_config.lora_config.activated_lora_enabled: + if (vllm_config.lora_config + and vllm_config.lora_config.activated_lora_enabled): # lets torch.compile know that forward_context needs to be # considered as an input to the layer (copied from attention) compilation_config = vllm_config.compilation_config From 99b8b60ad7ef004b7343ebb9f6eea77667377b2f Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 17:23:45 +0000 Subject: [PATCH 18/33] arg_utils.py: fix typo Signed-off-by: Thomas Parnell --- vllm/engine/arg_utils.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5de42cac7feb..82f33acc4fec 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -735,7 +735,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: lora_group.add_argument("--enable-lora-bias", **lora_kwargs["bias_enabled"]) lora_group.add_argument("--enable-activated-lora", - **lora_kwargs["activated_lora"]) + **lora_kwargs["activated_lora_enabled"]) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) From 477ab6eb14d01f8a51562a538bd6c8c76adff6eb Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 19 Jun 2025 18:27:57 +0000 Subject: [PATCH 19/33] Additional checking of lora_config Signed-off-by: Thomas Parnell --- vllm/v1/engine/processor.py | 3 ++- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index e531363829fa..c783ced22bd4 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -330,7 +330,8 @@ def process_inputs( sorted_mm_inputs = orig_sorted_mm_inputs # Tokenize aLoRA invocation sequence if applicable. - if self.lora_config.activated_lora_enabled and lora_request is not None: + if (self.lora_config and self.lora_config.activated_lora_enabled + and lora_request is not None): text_config = self.model_config.hf_config.get_text_config() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e5ab1018bbce..917931eacbe4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -238,7 +238,7 @@ def __init__( dtype=torch.int64, device=self.device) - if self.lora_config.activated_lora_enabled: + if self.lora_config and self.lora_config.activated_lora_enabled: self.mask1d = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -762,7 +762,7 @@ def _prepare_inputs( self.set_active_loras(self.input_batch, num_scheduled_tokens) # Compute a-LoRA metadata - if self.lora_config.activated_lora_enabled: + if self.lora_config and self.lora_config.activated_lora_enabled: invocation_start = np.empty(shape=(num_reqs, ), dtype=int) for req_id in self.input_batch.req_ids: req_index = self.input_batch.req_id_to_index[req_id] @@ -1967,7 +1967,7 @@ def _dummy_run( num_tokens, None, False) alora_metadata = None - if self.lora_config.activated_lora_enabled: + if self.lora_config and self.lora_config.activated_lora_enabled: mask1d = self.mask1d[:num_tokens] alora_metadata = ALoRAMetadata(mask1d=mask1d) # needed to avoid guard failures From 51edf96a19cf5121946ccc19f5944f4dc901d265 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 15:29:23 -0400 Subject: [PATCH 20/33] Fix example Signed-off-by: Thomas Parnell --- vllm/lora/peft_helper.py | 4 ++-- vllm/v1/engine/processor.py | 17 +---------------- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index dbddd4c21b5e..f28365d5feb4 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -35,8 +35,8 @@ class PEFTHelper: use_rslora: bool = field(default=False) # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) use_dora: bool = field(default=False) - # Invocation string for Activated LoRA (aLoRA, see: 
https://arxiv.org/abs/2504.12397) - invocation_string: Optional[str] = field(default=None) + # Invocation tokens for Activated LoRA (aLoRA, see: https://arxiv.org/abs/2504.12397) + alora_invocation_tokens: Optional[list[int]] = field(default=None) # Extra vllm field, start with 'vllm_' to avoid conflict vllm_lora_scaling_factor: float = field(default=1.0) vllm_max_position_embeddings: Optional[int] = field(default=False) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 346be3cad187..7916a8c711e4 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -442,37 +442,22 @@ def process_inputs( if peft_helper.alora_invocation_tokens is not None: invocation_tokens = peft_helper.alora_invocation_tokens - delta = 0 - elif peft_helper.invocation_string is not None: - # backwards compatibility - invocation_tokens = self.input_preprocessor._tokenize_prompt( - peft_helper.invocation_string, - lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs) - delta = 1 - else: - invocation_tokens = None - - if invocation_tokens is not None: invocation_start = -1 n = len(invocation_tokens) token_ids = decoder_inputs["prompt_token_ids"] - if n > 0 and len(token_ids) >= n: # scan backward for the last match # (faster than full forward scan+max) for idx in range(len(token_ids) - n, -1, -1): if token_ids[idx:idx + n] == invocation_tokens: # weights activated after start - invocation_start = idx + delta + invocation_start = idx break - if invocation_start == -1: raise ValueError( "Invocation sequence not found in prompt " f"for request '{request_id}'. aLoRA models require the " "invocation tokens to be present in the input.") - lora_request.invocation_start = invocation_start return decoder_inputs.get("prompt"), EngineCoreRequest( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 37fd3e63ca16..b278b6254ca5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1031,8 +1031,8 @@ def _prepare_inputs( alora_metadata = None return (attn_metadata, logits_indices, spec_decode_metadata, - num_scheduled_tokens, spec_decode_common_attn_metadata, - alora_metadata, max_num_scheduled_tokens) + alora_metadata, num_scheduled_tokens, + spec_decode_common_attn_metadata, max_num_scheduled_tokens) def _compute_cascade_attn_prefix_len( self, From a9d5986c2ec1a56a051f63b545b9aca260ad1a68 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 15:40:47 -0400 Subject: [PATCH 21/33] Inject aLoRA behaviour via mixin Signed-off-by: Thomas Parnell --- vllm/lora/layers.py | 19 ++----------------- vllm/lora/utils.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ed0e43cb8b3c..512a413e58b4 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -874,8 +874,7 @@ def can_replace_layer( model_config: Optional[PretrainedConfig], ) -> bool: return (type(source_layer) is QKVParallelLinear - and len(packed_modules_list) == 3 - and not lora_config.activated_lora_enabled) + and len(packed_modules_list) == 3) #TODO: Implement this @@ -1194,8 +1193,7 @@ def can_replace_layer( return False -class MergedQKVParallelLinearWithActivatedLoRA(MergedQKVParallelLinearWithLoRA - ): +class ActivatedLoRAMixin: def apply(self, x: torch.Tensor, @@ -1226,16 +1224,3 @@ def apply(self, # Apply alora mask final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) return final_output - - @classmethod - def can_replace_layer( - cls, - 
source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - """Returns True if the layer can be replaced by this LoRA layer.""" - return (type(source_layer) is QKVParallelLinear - and len(packed_modules_list) == 3 - and lora_config.activated_lora_enabled) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 959bdf23b181..af62d0cfb473 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -21,10 +21,10 @@ # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable -from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, +from vllm.lora.layers import (ActivatedLoRAMixin, BaseLayerWithLoRA, + ColumnParallelLinearWithLoRA, LogitsProcessorWithLoRA, MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithActivatedLoRA, MergedQKVParallelLinearWithLoRA, QKVParallelLinearWithLoRA, ReplicatedLinearWithLoRA, @@ -48,7 +48,6 @@ MergedColumnParallelLinearWithLoRA, QKVParallelLinearWithLoRA, MergedQKVParallelLinearWithLoRA, - MergedQKVParallelLinearWithActivatedLoRA, RowParallelLinearWithLoRA, ReplicatedLinearWithLoRA, LogitsProcessorWithLoRA, @@ -71,6 +70,12 @@ def from_layer(layer: nn.Module, lora_config=lora_config, packed_modules_list=packed_modules_list, model_config=model_config): + # inject a-LoRA behaviour + if (lora_config.activated_lora_enabled + and issubclass(lora_cls, BaseLayerWithLoRA)): + lora_cls = type( + lora_cls.__name__.replace("LoRA", "ActivatedLoRA"), + (ActivatedLoRAMixin, lora_cls), {}) instance_layer = lora_cls(layer) instance_layer.create_lora_weights(max_loras, lora_config, model_config) From 6fbc10893a844a1e90fc003734a09cd8f1524a39 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:14:55 -0400 Subject: [PATCH 22/33] Linting Signed-off-by: Thomas Parnell --- vllm/v1/worker/gpu_model_runner.py | 32 +++++----------------- vllm/v1/worker/lora_model_runner_mixin.py | 33 +++++++++++++++++++++++ 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b278b6254ca5..7cc9c8943478 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -303,7 +303,6 @@ def __init__( self.query_start_loc = self._make_buffer(self.max_num_reqs + 1, dtype=torch.int32) self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32) - # Because inputs_embeds may be bfloat16 and we don't need a numpy # version of this tensor, avoid a RuntimeError by not creating a # numpy buffer. 
@@ -1008,27 +1007,12 @@ def _prepare_inputs( self.set_active_loras(self.input_batch, num_scheduled_tokens) # Compute aLoRA metadata + alora_metadata = None if self.lora_config and self.lora_config.activated_lora_enabled: - invocation_start = np.empty(shape=(num_reqs, ), dtype=int) - for req_id in self.input_batch.req_ids: - req_index = self.input_batch.req_id_to_index[req_id] - cached_lora_request = self.requests[req_id].lora_request - if (cached_lora_request is not None - and cached_lora_request.invocation_start is not None): - invocation_start[ - req_index] = cached_lora_request.invocation_start - else: - invocation_start[req_index] = len( - self.requests[req_id].prompt_token_ids) - mask1d_cpu = torch.tensor(positions_np - < invocation_start[req_indices], - dtype=torch.bool, - device="cpu") - mask1d = self.mask1d[:total_num_scheduled_tokens] - mask1d.copy_(mask1d_cpu, non_blocking=True) - alora_metadata = ALoRAMetadata(mask1d=mask1d) - else: - alora_metadata = None + alora_metadata = self.build_alora_metadata( + num_reqs, positions_np, req_indices, + total_num_scheduled_tokens, self.input_batch, self.requests, + self.mask1d) return (attn_metadata, logits_indices, spec_decode_metadata, alora_metadata, num_scheduled_tokens, @@ -2648,10 +2632,8 @@ def _dummy_run( alora_metadata = None if self.lora_config and self.lora_config.activated_lora_enabled: - mask1d = self.mask1d[:num_tokens] - alora_metadata = ALoRAMetadata(mask1d=mask1d) - # needed to avoid guard failures - torch._dynamo.mark_dynamic(alora_metadata.mask1d, 0) + alora_metadata = self.build_dummy_alora_metadata( + num_tokens, self.mask1d) with self.maybe_randomize_inputs(input_ids), set_forward_context( attn_metadata, diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 4b5f27d27541..c000ca9f28db 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -12,11 +12,13 @@ import torch.nn as nn from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig +from vllm.forward_context import ALoRAMetadata from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.v1.worker.gpu_input_batch import CachedRequestState from vllm.v1.worker.gpu_input_batch import InputBatch as GPUInputBatch from vllm.v1.worker.tpu_input_batch import InputBatch as TPUInputBatch @@ -86,6 +88,37 @@ def set_active_loras(self, input_batch: InputBatch, return self._set_active_loras(prompt_lora_mapping, token_lora_mapping, lora_requests) + def build_alora_metadata(self, num_reqs: int, positions_np: np.ndarray, + req_indices: np.ndarray, + total_num_scheduled_tokens: int, + input_batch: InputBatch, + requests: dict[str, CachedRequestState], + mask1d: torch.Tensor) -> ALoRAMetadata: + invocation_start = np.empty(shape=(num_reqs, ), dtype=int) + for req_id in input_batch.req_ids: + req_index = input_batch.req_id_to_index[req_id] + cached_lora_request = requests[req_id].lora_request + if (cached_lora_request is not None + and cached_lora_request.invocation_start is not None): + invocation_start[ + req_index] = cached_lora_request.invocation_start + else: + invocation_start[req_index] = len( + requests[req_id].prompt_token_ids) + mask1d_cpu = torch.tensor(positions_np < invocation_start[req_indices], + dtype=torch.bool, + device="cpu") + mask1d = 
mask1d[:total_num_scheduled_tokens] + mask1d.copy_(mask1d_cpu, non_blocking=True) + return ALoRAMetadata(mask1d=mask1d) + + def build_dummy_alora_metadata(self, num_tokens: int, + mask1d: torch.tensor): + alora_metadata = ALoRAMetadata(mask1d=mask1d[:num_tokens]) + # needed to avoid guard failures + torch._dynamo.mark_dynamic(alora_metadata.mask1d, 0) + return alora_metadata + @contextmanager def maybe_setup_dummy_loras(self, lora_config: Optional[LoRAConfig], From cb373e958f9be4aae2c72e77b525b26684deb929 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:18:59 -0400 Subject: [PATCH 23/33] lint Signed-off-by: Thomas Parnell --- vllm/lora/layers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 512a413e58b4..ad72b6056502 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1195,6 +1195,13 @@ def can_replace_layer( class ActivatedLoRAMixin: + base_layer: LinearBase + punica_wrapper: PunicaWrapperBase + lora_a_stacked: torch.tensor + lora_b_stacked: torch.tensor + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] + output_slices: tuple[int, ...] + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: From 24dfc4a39b42086d41ccb02db621818b39fff4ca Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:22:53 -0400 Subject: [PATCH 24/33] add todo Signed-off-by: Thomas Parnell --- vllm/lora/layers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ad72b6056502..4453760312c3 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1221,6 +1221,7 @@ def apply(self, mask2d = mask1d.unsqueeze(1).to(output.dtype) # Clone base layer output before running LoRA + # TODO(tdoublep): pass in mask1d and only operate on valid entries orig_out = output.clone() # Apply LoRA in‐place on `output`: From 6c1b46a123fcd9e8e848dc75e31290bca708dc78 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:34:19 -0400 Subject: [PATCH 25/33] Reorganize LoRA examples Signed-off-by: Thomas Parnell --- examples/offline_inference/lora/README.md | 24 +++++++++++++++++++ .../lora/activated_lora.py} | 1 + .../lora_with_quantization_inference.py | 0 .../{ => lora}/multilora_inference.py | 0 4 files changed, 25 insertions(+) create mode 100644 examples/offline_inference/lora/README.md rename examples/{alora/alora_offline_example.py => offline_inference/lora/activated_lora.py} (98%) rename examples/offline_inference/{ => lora}/lora_with_quantization_inference.py (100%) rename examples/offline_inference/{ => lora}/multilora_inference.py (100%) diff --git a/examples/offline_inference/lora/README.md b/examples/offline_inference/lora/README.md new file mode 100644 index 000000000000..2b4d501c3b42 --- /dev/null +++ b/examples/offline_inference/lora/README.md @@ -0,0 +1,24 @@ +# LoRA Examples + +This folder contains examples of offline inference using LoRA. 
+ +## Multi-LoRA +This example shows how to use the multi-LoRA functionality: +``` +python examples/offline_inference/lora/multilora_inference.py +``` + +## LoRA with Quantization +This example shows how to use LoRA with different quantization techniques: +``` +python examples/offline_inference/lora/lora_with_quantization_inference.py +``` + +## Activated LoRA +This example how to use [activated LoRA](https://arxiv.org/abs/2504.12397): +``` +python examples/offline_inference/lora/activated_lora.py +``` + + + diff --git a/examples/alora/alora_offline_example.py b/examples/offline_inference/lora/activated_lora.py similarity index 98% rename from examples/alora/alora_offline_example.py rename to examples/offline_inference/lora/activated_lora.py index 4133ac341d23..98cec0a64cf0 100644 --- a/examples/alora/alora_offline_example.py +++ b/examples/offline_inference/lora/activated_lora.py @@ -29,6 +29,7 @@ enable_activated_lora=True, dtype=torch.bfloat16, max_lora_rank=64, + enforce_eager=True, ) prompts = [ diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora/lora_with_quantization_inference.py similarity index 100% rename from examples/offline_inference/lora_with_quantization_inference.py rename to examples/offline_inference/lora/lora_with_quantization_inference.py diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/lora/multilora_inference.py similarity index 100% rename from examples/offline_inference/multilora_inference.py rename to examples/offline_inference/lora/multilora_inference.py From b8444d972499e15a8064d239bbeadb555be09a85 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:38:01 -0400 Subject: [PATCH 26/33] Lint Signed-off-by: Thomas Parnell --- examples/offline_inference/lora/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/offline_inference/lora/README.md b/examples/offline_inference/lora/README.md index 2b4d501c3b42..69ecad549f6c 100644 --- a/examples/offline_inference/lora/README.md +++ b/examples/offline_inference/lora/README.md @@ -19,6 +19,3 @@ This example how to use [activated LoRA](https://arxiv.org/abs/2504.12397): ``` python examples/offline_inference/lora/activated_lora.py ``` - - - From 4e513cc5c7471c8b8853db38f536553d4e2972c1 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:39:27 -0400 Subject: [PATCH 27/33] lint Signed-off-by: Thomas Parnell --- vllm/lora/layers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 4453760312c3..1d0c616f9000 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -4,7 +4,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Union, cast +from typing import Optional, Union, cast import torch import torch.nn as nn @@ -20,6 +20,7 @@ tensor_model_parallel_all_reduce) from vllm.distributed.utils import divide from vllm.forward_context import get_forward_context +from vllm.lora.punica_wrapper import PunicaWrapperBase # yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, @@ -33,9 +34,6 @@ VocabParallelEmbedding) from vllm.platforms import current_platform -if TYPE_CHECKING: - from vllm.lora.punica_wrapper import PunicaWrapperBase - def _get_lora_device(base_layer: nn.Module) -> torch.device: # code borrowed from 
https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34 From b9df31f69638bf6d2388a4a08f9f62eb52d557a8 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:41:31 -0400 Subject: [PATCH 28/33] more lint Signed-off-by: Thomas Parnell --- examples/offline_inference/lora/README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/offline_inference/lora/README.md b/examples/offline_inference/lora/README.md index 69ecad549f6c..afe5bda548c2 100644 --- a/examples/offline_inference/lora/README.md +++ b/examples/offline_inference/lora/README.md @@ -1,21 +1,27 @@ # LoRA Examples -This folder contains examples of offline inference using LoRA. +This folder contains examples of offline inference using LoRA. ## Multi-LoRA + This example shows how to use the multi-LoRA functionality: -``` + +```bash python examples/offline_inference/lora/multilora_inference.py ``` ## LoRA with Quantization + This example shows how to use LoRA with different quantization techniques: -``` + +```bash python examples/offline_inference/lora/lora_with_quantization_inference.py ``` ## Activated LoRA + This example how to use [activated LoRA](https://arxiv.org/abs/2504.12397): -``` + +```bash python examples/offline_inference/lora/activated_lora.py ``` From 643d89357bfbde6b3a110852d5edee26959a3b82 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 8 Sep 2025 16:48:03 -0400 Subject: [PATCH 29/33] Cleanup example Signed-off-by: Thomas Parnell --- examples/offline_inference/lora/activated_lora.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/offline_inference/lora/activated_lora.py b/examples/offline_inference/lora/activated_lora.py index 98cec0a64cf0..be469611d4f6 100644 --- a/examples/offline_inference/lora/activated_lora.py +++ b/examples/offline_inference/lora/activated_lora.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import time import torch @@ -14,8 +13,6 @@ ALORA_NAME = "ibm-granite/granite-3.2-8b-alora-uncertainty" invocation_string = "<|start_of_role|>certainty<|end_of_role|>" -os.environ["VLLM_USE_V1"] = "1" - # download your LoRA adapter to ~/.cache/huggingface/… alora_path = snapshot_download(repo_id=ALORA_NAME) @@ -29,7 +26,6 @@ enable_activated_lora=True, dtype=torch.bfloat16, max_lora_rank=64, - enforce_eager=True, ) prompts = [ From 03b64801faec02885f969cdc3f2855b9c988c3b1 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 11 Sep 2025 06:14:39 -0400 Subject: [PATCH 30/33] Refactor according to new structure Signed-off-by: Thomas Parnell --- vllm/lora/layers/__init__.py | 2 + vllm/lora/layers/activated_linear.py | 63 ++++++++++++++++++++++++++++ vllm/lora/utils.py | 11 ++--- 3 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 vllm/lora/layers/activated_linear.py diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py index d3bb145dc7bf..23b6a65881ba 100644 --- a/vllm/lora/layers/__init__.py +++ b/vllm/lora/layers/__init__.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.lora.layers.activated_linear import LinearLayerWithActivatedLoRAMixin from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.layers.column_parallel_linear import ( ColumnParallelLinearWithLoRA, ColumnParallelLinearWithShardedLoRA, @@ -31,4 +32,5 @@ "RowParallelLinearWithShardedLoRA", 
"ReplicatedLinearWithLoRA", "LoRAMapping", + "LinearLayerWithActivatedLoRAMixin", ] diff --git a/vllm/lora/layers/activated_linear.py b/vllm/lora/layers/activated_linear.py new file mode 100644 index 000000000000..894d7330039a --- /dev/null +++ b/vllm/lora/layers/activated_linear.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.forward_context import get_forward_context +from vllm.lora.punica_wrapper import PunicaWrapperBase + +from .base_linear import BaseLinearLayerWithLoRA + +if TYPE_CHECKING: + from .base import BaseLayerWithLoRA + + +class LinearLayerWithActivatedLoRAMixin: + + base_layer: BaseLinearLayerWithLoRA + punica_wrapper: PunicaWrapperBase + lora_a_stacked: torch.tensor + lora_b_stacked: torch.tensor + lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] + output_slices: tuple[int, ...] + + @classmethod + def maybe_mixin(cls, lora_cls: "BaseLayerWithLoRA"): + if issubclass(lora_cls, BaseLinearLayerWithLoRA): + return type(lora_cls.__name__.replace("LoRA", "ActivatedLoRA"), + (cls, lora_cls), {}) + else: + return lora_cls + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + # In transformers backend, x and output have extra batch dimension like + # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), + # therefore we need to flatten the batch dimensions. + if x.ndim == 3 and output.ndim == 3: + output = output.flatten(0, 1) + x = x.flatten(0, 1) + + # Extract aLoRA batch metadata from forward context + alora_metadata = get_forward_context().alora_metadata + + mask1d = alora_metadata.mask1d + mask2d = mask1d.unsqueeze(1).to(output.dtype) + + # Clone base layer output before running LoRA + # TODO(tdoublep): pass in mask1d and only operate on valid entries + orig_out = output.clone() + + # Apply LoRA in‐place on `output`: + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.lora_bias_stacked, 1.0, + self.output_slices) + # Apply alora mask + final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) + return final_output diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 99aae6355295..bf122e6302f4 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -16,8 +16,7 @@ # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable -from vllm.lora.layers import (BaseLayerWithLoRA, BaseLinearLayerWithLoRA, - ColumnParallelLinearWithLoRA, +from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, ColumnParallelLinearWithShardedLoRA, LinearLayerWithActivatedLoRAMixin, LogitsProcessorWithLoRA, @@ -72,11 +71,9 @@ def from_layer(layer: nn.Module, packed_modules_list=packed_modules_list, model_config=model_config): # inject a-LoRA behaviour - if (lora_config.activated_lora_enabled - and issubclass(lora_cls, BaseLinearLayerWithLoRA)): - lora_cls = type( - lora_cls.__name__.replace("LoRA", "ActivatedLoRA"), - (LinearLayerWithActivatedLoRAMixin, lora_cls), {}) + if lora_config.activated_lora_enabled: + lora_cls = LinearLayerWithActivatedLoRAMixin.maybe_mixin( + lora_cls) instance_layer = lora_cls(layer) instance_layer.create_lora_weights(max_loras, lora_config, model_config) From 76744da509640f0cdef32ba96cb7fe2577cff370 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 11 Sep 
2025 12:16:44 +0200 Subject: [PATCH 31/33] Apply suggestions from code review Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Thomas Parnell --- vllm/config/__init__.py | 2 +- vllm/engine/arg_utils.py | 6 +++--- vllm/lora/peft_helper.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index d2be7fabd414..325c8f05b245 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2440,7 +2440,7 @@ class LoRAConfig: bias_enabled: bool = False """[DEPRECATED] Enable bias for LoRA adapters. This option will be removed in v0.12.0.""" - activated_lora_enabled: bool = False + enable_activated_lora: bool = False """Enable Activated LoRA.""" def compute_hash(self) -> str: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bdef2e7a397c..c3e5cc8ce6a2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -373,7 +373,7 @@ class EngineArgs: # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled - enable_activated_lora: bool = LoRAConfig.activated_lora_enabled + enable_activated_lora: bool = LoRAConfig.enable_activated_lora max_loras: int = LoRAConfig.max_loras max_lora_rank: int = LoRAConfig.max_lora_rank default_mm_loras: Optional[Dict[str, str]] = \ @@ -796,7 +796,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: lora_group.add_argument("--enable-lora-bias", **lora_kwargs["bias_enabled"]) lora_group.add_argument("--enable-activated-lora", - **lora_kwargs["activated_lora_enabled"]) + **lora_kwargs["enable_activated_lora"]) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) @@ -1372,7 +1372,7 @@ def create_engine_config( lora_config = LoRAConfig( bias_enabled=self.enable_lora_bias, - activated_lora_enabled=self.enable_activated_lora, + enable_activated_lora=self.enable_activated_lora, max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, default_mm_loras=self.default_mm_loras, diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index f28365d5feb4..de11c750d7d1 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -36,7 +36,7 @@ class PEFTHelper: # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) use_dora: bool = field(default=False) # Invocation tokens for Activated LoRA (aLoRA, see: https://arxiv.org/abs/2504.12397) - alora_invocation_tokens: Optional[list[int]] = field(default=None) + alora_invocation_tokens: Optional[list[int]] = None # Extra vllm field, start with 'vllm_' to avoid conflict vllm_lora_scaling_factor: float = field(default=1.0) vllm_max_position_embeddings: Optional[int] = field(default=False) From 6b83cc4268ca073de4b28439e751d07ced2438f7 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 11 Sep 2025 06:24:33 -0400 Subject: [PATCH 32/33] Fix a few naming issues Signed-off-by: Thomas Parnell --- vllm/lora/utils.py | 2 +- vllm/model_executor/layers/linear.py | 2 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index bf122e6302f4..c52ee583bb32 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -71,7 +71,7 @@ def from_layer(layer: nn.Module, packed_modules_list=packed_modules_list, model_config=model_config): # inject a-LoRA behaviour - if 
lora_config.activated_lora_enabled: + if lora_config.enable_activated_lora: lora_cls = LinearLayerWithActivatedLoRAMixin.maybe_mixin( lora_cls) instance_layer = lora_cls(layer) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 22a9c8fe247b..23ba302a77b4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -243,7 +243,7 @@ def __init__( vllm_config = get_current_vllm_config() if (vllm_config.lora_config - and vllm_config.lora_config.activated_lora_enabled): + and vllm_config.lora_config.enable_activated_lora): # lets torch.compile know that forward_context needs to be # considered as an input to the layer (copied from attention) compilation_config = vllm_config.compilation_config diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 477422cf0d30..46b30f04d9de 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -425,7 +425,7 @@ def process_inputs( mm_position=decoder_mm_positions[modality][idx])) # Handle aLoRA invocation sequence if applicable. - if (self.lora_config and self.lora_config.activated_lora_enabled + if (self.lora_config and self.lora_config.enable_activated_lora and lora_request is not None): text_config = self.model_config.hf_config.get_text_config() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 02f17e154a1b..939b53752a7c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -329,7 +329,7 @@ def __init__( self.num_accepted_tokens = self._make_buffer(self.max_num_reqs, dtype=torch.int64) - if self.lora_config and self.lora_config.activated_lora_enabled: + if self.lora_config and self.lora_config.enable_activated_lora: self.mask1d = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -1101,7 +1101,7 @@ def _prepare_inputs( # Compute aLoRA metadata alora_metadata = None - if self.lora_config and self.lora_config.activated_lora_enabled: + if self.lora_config and self.lora_config.enable_activated_lora: alora_metadata = self.build_alora_metadata( num_reqs, positions_np, req_indices, total_num_scheduled_tokens, self.input_batch, self.requests, @@ -2816,7 +2816,7 @@ def _dummy_run( f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.") alora_metadata = None - if self.lora_config and self.lora_config.activated_lora_enabled: + if self.lora_config and self.lora_config.enable_activated_lora: alora_metadata = self.build_dummy_alora_metadata( num_tokens, self.mask1d) From 18397d783411bc86084c672e8e2d36259aead4f5 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 11 Sep 2025 06:55:13 -0400 Subject: [PATCH 33/33] lint Signed-off-by: Thomas Parnell --- vllm/lora/layers/activated_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/layers/activated_linear.py b/vllm/lora/layers/activated_linear.py index 894d7330039a..adf2d299d009 100644 --- a/vllm/lora/layers/activated_linear.py +++ b/vllm/lora/layers/activated_linear.py @@ -24,7 +24,7 @@ class LinearLayerWithActivatedLoRAMixin: output_slices: tuple[int, ...] @classmethod - def maybe_mixin(cls, lora_cls: "BaseLayerWithLoRA"): + def maybe_mixin(cls, lora_cls: "type[BaseLayerWithLoRA]"): if issubclass(lora_cls, BaseLinearLayerWithLoRA): return type(lora_cls.__name__.replace("LoRA", "ActivatedLoRA"), (cls, lora_cls), {})
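
Reviewer note: for anyone unfamiliar with the dynamic class composition that `maybe_mixin` relies on, below is a minimal, self-contained sketch of the pattern. The class names and the free-function form are illustrative stand-ins (in the patch the helper is a classmethod on `LinearLayerWithActivatedLoRAMixin` and the base is vLLM's `BaseLinearLayerWithLoRA`); it only demonstrates how `type()` plus the MRO lets the mixin's `apply()` wrap the existing LoRA layer class without maintaining a per-layer subclass.

```python
# Standalone sketch of the dynamic mixin composition used by maybe_mixin().
# The classes below are illustrative stand-ins, not the real vLLM layers.


class BaseLinearLayerWithLoRA:
    """Placeholder for a LoRA-wrapped linear layer."""

    def apply(self, x: str) -> str:
        return f"lora({x})"


class ActivatedLoRAMixin:
    """Overrides apply(); everything else is inherited from the LoRA class."""

    def apply(self, x: str) -> str:
        # The mixin comes first in the MRO, so this wraps the LoRA apply().
        return f"masked({super().apply(x)})"


def maybe_mixin(lora_cls: type) -> type:
    # Compose a new class only for linear LoRA layers; others pass through.
    if issubclass(lora_cls, BaseLinearLayerWithLoRA):
        return type(lora_cls.__name__.replace("LoRA", "ActivatedLoRA"),
                    (ActivatedLoRAMixin, lora_cls), {})
    return lora_cls


ActivatedCls = maybe_mixin(BaseLinearLayerWithLoRA)
print(ActivatedCls.__name__)      # BaseLinearLayerWithActivatedLoRA
print(ActivatedCls().apply("x"))  # masked(lora(x))
```

Because the mixin is listed first in the bases, only `apply()` is overridden; weight creation and every other attribute continue to come from the wrapped LoRA class, which is why the a-LoRA behaviour can be injected for all linear LoRA layer types at once.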