@@ -248,6 +248,7 @@ def create_column_parallel_packed_layer():
     # self.jax_config.mesh.devices[0][0].platform
     jax_config = JaxCommonLinearConfig(vllm_config, mesh, base_linear)
     linear_method = VllmUnquantizedLinearMethod(jax_config)
+    base_linear.quant_method = linear_method
     linear_method.process_weights_after_loading(base_linear)
     # here base_linear.weight is on TPU and sharded.

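Context for the added `base_linear.quant_method = linear_method` line: vLLM linear layers route their forward pass through the `quant_method` attached to them, and the LoRA wrapper exercised later calls `base_layer.quant_method.apply(...)`, so the JAX-backed method has to be attached before the layer is called. A minimal stand-alone sketch of that dispatch pattern, with toy class names that are not vLLM's:

```python
import torch

# Toy illustration of the quant_method dispatch pattern; class names are
# made up and only the attach-before-forward idea mirrors the diff above.
class ToyLinearMethod:
    def apply(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
        # A real method could shard the weight or hand the matmul to JAX here.
        return torch.nn.functional.linear(x, layer.weight)


class ToyLinear(torch.nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))
        self.quant_method = None  # must be attached before the first forward

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.quant_method.apply(self, x)


layer = ToyLinear(8, 4)
layer.quant_method = ToyLinearMethod()  # analogous to the added line above
print(layer(torch.randn(2, 8)).shape)   # torch.Size([2, 4])
```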
@@ -263,6 +264,8 @@ def create_column_parallel_packed_layer():
         raise NotImplementedError("NYI: for QKVParallelLinear case")

     n_slices = repeats
+    # TODO(xw): check if we can enable torchax globally.
+    # TODO(xw): check if we can calculate both actual and expected output using torchax.
     with torchax.default_env():
         # create_lora_weights creates global shape weight.
         lora_linear.create_lora_weights(max_loras, lora_config)
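Background for the two TODOs: `torchax.default_env()` returns an environment that, used as a context manager, routes torch operations through JAX, whereas `torchax.enable_globally()` switches that interception on process-wide. A hedged sketch of both patterns; the shapes are arbitrary and the exact behaviour (e.g. the `'jax'` device string) may vary across torchax versions:

```python
import torch
import torchax

# Scoped interception, as used in this test: torch ops issued inside the
# context run through JAX. Tensors created on the 'jax' device are backed
# by jax.Array values.
with torchax.default_env():
    x = torch.randn(16, 32, device='jax')
    w = torch.randn(64, 32, device='jax')
    y = torch.nn.functional.linear(x, w)  # executes via JAX / XLA

# Process-wide alternative referenced by the first TODO:
# torchax.enable_globally()
```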
@@ -282,10 +285,11 @@ def create_column_parallel_packed_layer():

     max_num_batched_tokens = 8192
     max_batches = 256
-    punica_wrapper = get_punica_wrapper(max_num_batched_tokens,
-                                        max_batches,
-                                        device,
-                                        max_loras=max_loras)
+    with torchax.default_env():
+        punica_wrapper = get_punica_wrapper(max_num_batched_tokens,
+                                            max_batches,
+                                            'jax',
+                                            max_loras=max_loras)
     assert check_punica_wrapper(punica_wrapper)
     lora_linear.set_mapping(punica_wrapper)

@@ -333,7 +337,8 @@ def create_column_parallel_packed_layer():
     with torchax.default_env():
         # lora_result = lora_linear(torch.cat(jax_inputs))[0]
         # lora_result = j2t(lora_result)
-        lora_result = linear_method.apply(lora_linear.base_layer, torch.cat(jax_inputs))
+        # lora_result = linear_method.apply(lora_linear.base_layer, torch.cat(jax_inputs))
+        lora_result = lora_linear(torch.cat(jax_inputs))[0]

     expected_results: list[torch.Tensor] = []
     for input_, lora_id in zip(inputs, prompt_mapping):
@@ -348,17 +353,18 @@ def create_column_parallel_packed_layer():
     expected_result = torch.cat(expected_results)

     rtol, atol = TOLERANCES[lora_result.dtype]
-    # with torchax.default_env():
-    #     torch.testing.assert_close(lora_result.to('cpu'),
-    #                                expected_result,
-    #                                rtol=rtol,
-    #                                atol=atol)
-    #     print(
-    #         f'Output max diff: {torch.max(torch.abs(expected_result.to('cpu') - lora_result))}'
-    #     )
-    #     print(
-    #         f'Output mean diff: {torch.mean(torch.abs(expected_result.to('cpu') - lora_result))}'
-    #     )
+    with torchax.default_env():
+        lora_result_cpu = lora_result.to('cpu')
+        torch.testing.assert_close(lora_result_cpu,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
+        print(
+            f'Output max diff: {torch.max(torch.abs(expected_result - lora_result_cpu))}'
+        )
+        print(
+            f'Output mean diff: {torch.mean(torch.abs(expected_result - lora_result_cpu))}'
+        )

     # Check that resetting the lora weights succeeds
     # Here we set all lora weight to be empty.
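The re-enabled check follows the usual pattern for comparing a device-backed result against a CPU reference: copy the actual output to CPU once, then compare with dtype-dependent tolerances. A self-contained sketch of that pattern; the tolerance values below are placeholders, not the ones defined in this test file:

```python
import torch

# Placeholder tolerances keyed by dtype; the real TOLERANCES table in the
# test file may use different values.
TOLERANCES = {
    torch.float16: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
    torch.float32: (1e-4, 1e-5),
}


def assert_matches(actual: torch.Tensor, expected_cpu: torch.Tensor) -> None:
    rtol, atol = TOLERANCES[actual.dtype]
    actual_cpu = actual.to('cpu')  # bring the device-backed result to host
    torch.testing.assert_close(actual_cpu, expected_cpu, rtol=rtol, atol=atol)
    print(f'Output max diff: {torch.max(torch.abs(expected_cpu - actual_cpu))}')
    print(f'Output mean diff: {torch.mean(torch.abs(expected_cpu - actual_cpu))}')
```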