vllm-project · yaochengji · May 28, 2025 · Jan 23, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -122,10 +122,8 @@ run_and_track_test 11 "test_struct_output_generate.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
 run_and_track_test 12 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-
-# Disable the TPU LoRA tests until the feature is activated
-# run_and_track_test 13 "test_lora (directory)" \
-#     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/"
+run_and_track_test 13 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

diff --git a/tests/tpu/lora/test_pallas_kernels.py b/tests/tpu/lora/test_pallas_kernels.py
@@ -200,7 +200,7 @@ def from_local_checkpoint(
             weights_mapper: Optional[WeightsMapper] = None,
             tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
-        
+
         Args:
             lora_dir: The local path that has lora data.
             expected_lora_modules: Name of modules that are expected to be
@@ -620,7 +620,7 @@ def _match_target_modules(self, module_name: str):
     def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         """
         Regarding multimodal models, vLLM currently only supports adding LoRA to
-        language model. LoRA for other modules, such as the vision tower, will 
+        language model. LoRA for other modules, such as the vision tower, will
         be filtered out.
         """
         if self.supports_mm:

@@ -1,63 +1,99 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import jax
+import jax.numpy as jnp
 import torch
+import torch.nn.functional as F
+import torch_xla.core.xla_builder as xb
+from torch.library import impl
+from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard
 
-# Required to register the custom ops
-import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
 
+@jax.jit
+def bgmv_jax(inputs, loras, idxs):
+    """Apply to each input row the ``loras`` matrix chosen by ``idxs``.
+
+    Following the einsum spec ``"td,tX,Xld->tl"``: ``inputs`` is indexed
+    ``(t, d)``, ``loras`` is indexed ``(X, l, d)``, and ``idxs`` holds one
+    integer per ``t`` that is expanded to a one-hot row over the ``X`` axis.
+
+    Returns:
+        Array indexed ``(t, l)``.
+    """
+    num_loras = loras.shape[0]
+    # The selector is created in inputs.dtype, same as the activations.
+    selector = jax.nn.one_hot(idxs, num_loras, dtype=inputs.dtype)
+    return jnp.einsum("td,tX,Xld->tl", inputs, selector, loras)
 
-def bgmv_expand(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                add_inputs: bool = True):
+
+XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor")
+
+
+@impl(XLA_LIB, "bgmv", "XLA")
+def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor):
+    """XLA implementation of the ``bgmv`` op: run ``bgmv_jax`` via call_jax.
+
+    Args:
+        inputs: Forwarded unchanged to ``bgmv_jax``.
+        loras: Forwarded to ``bgmv_jax``; a 4-D tensor first has axis 1
+            squeezed.
+        idxs: Forwarded unchanged to ``bgmv_jax``.
+
+    Returns:
+        Whatever ``xb.call_jax`` returns for ``bgmv_jax``.
+    """
+    # squeeze only drops axis 1 when its size is 1 (assumed for 4-D input).
+    lora_stack = loras.squeeze(axis=1) if len(loras.shape) == 4 else loras
+
+    jax_import_guard()
+    return xb.call_jax(bgmv_jax, (inputs, lora_stack, idxs))
+
+
+@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd")
+def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor,
+                 idxs: torch.IntTensor):
+    """Shape-only fallback for the ``bgmv`` op on non-XLA dispatch.
+
+    No arithmetic is performed: an uninitialized tensor with the op's
+    output shape is allocated and returned.
+
+    Args:
+        inputs: 2-D tensor; its first dimension sets the output row count.
+        loras: 3-D tensor, or 4-D (axis 1 is then squeezed); its middle
+            dimension sets the output column count.
+        idxs: Unused here; present to satisfy the op schema.
+
+    Returns:
+        Uninitialized tensor of shape ``(inputs.shape[0], loras.shape[1])``
+        on ``inputs.device`` with dtype ``inputs.dtype``.
+    """
+    T, _ = inputs.shape
+    if len(loras.shape) == 4:
+        loras = loras.squeeze(axis=1)
+    _, L, _ = loras.shape
+
+    # Carry the input dtype explicitly. Without it torch.empty allocates the
+    # default dtype, while the JAX kernel builds its one-hot selector in
+    # inputs.dtype, so the placeholder's dtype could disagree with the XLA
+    # result for non-default-dtype (e.g. bf16) inputs.
+    return torch.empty((T, L), dtype=inputs.dtype, device=inputs.device)
+
+
+def bgmv_expand(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    add_inputs: bool = True,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
-        
-        output_tensor (torch.Tensor): output tensor of shape 
+
+        output_tensor (torch.Tensor): output tensor of shape
             [num_tokens, hidden_size * num_slices].
-        
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output 
+        add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
+
+    Returns:
+        A new tensor: output_tensor plus the padded/sliced bgmv result when
+        add_inputs is true, otherwise the padded/sliced bgmv result alone.
+        output_tensor itself is not written to.
     """
 
     outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-    n_tokens = outputs.size(0)
 
+    # Rows kept from `outputs`: up to output_tensor's row count, or a single
+    # row when the op produced exactly one row but output_tensor has several.
     limit = output_tensor.shape[0]
     if outputs.shape[0] == 1 and output_tensor.shape[0] != 1:
         limit = 1
 
-    outputs = torch.cat(
-        (outputs,
-         torch.zeros((n_tokens, output_tensor.shape[1] - outputs.shape[1]),
-                     device=outputs.device)),
-        dim=1)
+    # Zero-pad the right side of the last dim when output_tensor is wider.
+    if output_tensor.shape[1] > outputs.shape[1]:
+        outputs = F.pad(outputs,
+                        (0, output_tensor.shape[1] - outputs.shape[1], 0, 0))
 
+    # The column slice trims `outputs` when it is wider than output_tensor.
     if add_inputs:
-        return output_tensor + outputs[:limit, :]
+        return output_tensor + outputs[:limit, :output_tensor.shape[1]]
     else:
-        return outputs[:limit, :]
+        return outputs[:limit, :output_tensor.shape[1]]
 
 
-def bgmv_shrink(inputs: torch.Tensor,
-                lora_b_weights: torch.Tensor,
-                output_tensor: torch.Tensor,
-                lora_indices_tensor: torch.Tensor,
-                scaling: float = 1.0):
+def bgmv_shrink(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    scaling: float = 1.0,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
         output_tensor (torch.Tensor): (Unused) output tensor (placeholder).
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
         scaling (float, optional): Scalar multiplier applied to the output.
     """
@@ -66,39 +102,41 @@ def bgmv_shrink(inputs: torch.Tensor,
                                         lora_indices_tensor)
 
 
-def bgmv_expand_slice(inputs: torch.Tensor,
-                      lora_b_weights: torch.Tensor,
-                      output_tensor: torch.Tensor,
-                      lora_indices_tensor: torch.Tensor,
-                      slice_offset: int,
-                      slice_size: int,
-                      add_inputs: bool = True):
+def bgmv_expand_slice(
+    inputs: torch.Tensor,
+    lora_b_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    slice_offset: int,
+    slice_size: int,
+    add_inputs: bool = True,
+):
     """
     Args:
         inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size].
-        
-        lora_b_weights (torch.Tensor): LoRA weights of shape 
+
+        lora_b_weights (torch.Tensor): LoRA weights of shape
             [num_loras, lora_rank, hidden_size].
-        
-        output_tensor (torch.Tensor): output tensor of shape 
+
+        output_tensor (torch.Tensor): output tensor of shape
             [num_tokens, hidden_size * num_slices].
-        
-        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] 
+
+        lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens]
             indicating which LoRA matrix to use for each token.
-        add_inputs (bool): Whether or not to add the input tensor to the output 
+        add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
     outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor)
-    n_tokens = outputs.size(0)
 
-    outputs = torch.cat((
-        torch.zeros((n_tokens, slice_offset), device=outputs.device),
+    outputs = F.pad(
         outputs,
-        torch.zeros(
-            (n_tokens, output_tensor.shape[1] - (slice_offset + slice_size)),
-            device=outputs.device),
-    ),
-                        dim=1)
+        (
+            slice_offset,
+            output_tensor.shape[1] - (slice_offset + slice_size),
+            0,
+            0,
+        ),
+    )
 
     if add_inputs:
         return output_tensor + outputs