|
19 | 19 | tensor_model_parallel_all_gather, |
20 | 20 | tensor_model_parallel_all_reduce) |
21 | 21 | from vllm.distributed.utils import divide |
| 22 | +from vllm.forward_context import get_forward_context |
22 | 23 | # yapf: disable |
23 | 24 | from vllm.model_executor.layers.linear import (ColumnParallelLinear, |
24 | 25 | LinearBase, |
@@ -418,14 +419,44 @@ def apply(self, |
418 | 419 | output = output.flatten(0, 1) |
419 | 420 | x = x.flatten(0, 1) |
420 | 421 |
|
421 | | - lora_output: Optional[ |
422 | | - torch.Tensor] = self.punica_wrapper.add_lora_linear( |
423 | | - output, x, self.lora_a_stacked, self.lora_b_stacked, |
424 | | - self.lora_bias_stacked, 1.0, self.output_slices) |
425 | | - if not current_platform.can_update_inplace(): |
426 | | - output = lora_output |
427 | | - |
428 | | - return output |
| 422 | + # Extract aLoRA batch metadata from forward context |
| 423 | + alora_metadata = get_forward_context().alora_metadata |
| 424 | + k_offsets = alora_metadata.k_offsets |
| 425 | + query_start_locs = alora_metadata.query_start_locs |
| 426 | + |
| 427 | + # Build the 1D "save-prefix" mask: |
| 428 | + T = output.size(0) # total tokens |
| 429 | + starts = query_start_locs[:-1] # start index of each request |
| 430 | + ends = query_start_locs[1:] # end index of each request |
| 431 | + lengths = ends - starts # request lengths |
| 432 | + kept_lens = lengths - k_offsets |
| 433 | + kept_lens = torch.clamp( |
| 434 | + kept_lens, |
| 435 | + min=0) # portion of each request kept as base-model output |
| 436 | + |
| 437 | + device = output.device |
| 438 | + # Create the aLoRA mask via a difference array and a running cumsum |
| 439 | + delta = torch.zeros(T + 1, device=device, dtype=output.dtype) |
| 440 | + ends_for_scatter = starts + kept_lens |
| 441 | + pos_vals = kept_lens.sign().to(output.dtype) |
| 442 | + neg_vals = -pos_vals |
| 443 | + delta.scatter_add_(0, starts, pos_vals) |
| 444 | + delta.scatter_add_(0, ends_for_scatter, neg_vals) |
| 445 | + cums = torch.cumsum(delta[:-1], dim=0) |
| 446 | + mask1d = cums > 0 # shape [T], bool |
| 447 | + mask2d = mask1d.unsqueeze(1).to(output.dtype) |
| 448 | + |
| 449 | + # Clone base layer output before running LoRA |
| 450 | + orig_out = output.clone() |
| 451 | + |
| 452 | + # Apply LoRA in-place on `output`: |
| 453 | + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, |
| 454 | + self.lora_b_stacked, |
| 455 | + self.lora_bias_stacked, 1.0, |
| 456 | + self.output_slices) |
| 457 | + # Apply aLoRA mask: keep base output on the saved prefix, LoRA output elsewhere |
| 458 | + final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d) |
| 459 | + return final_output |
429 | 460 |
|
430 | 461 | @property |
431 | 462 | def weight(self) -> torch.Tensor: |
|