Skip to content

Commit cc47bab

Browse files
committed
add multi-chip test case
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
1 parent c480658 commit cc47bab

File tree

2 files changed

+22
-3
lines changed

2 files changed

+22
-3
lines changed

.buildkite/pipeline_jax.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ steps:
157157
queue: tpu_v6e_queue
158158
commands:
159159
- |
160+
<<<<<<< HEAD
160161
if [[ "$$NIGHTLY" == "1" ]]; then
161162
.buildkite/scripts/run_in_docker.sh \
162163
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py && \
@@ -165,6 +166,12 @@ steps:
165166
echo "Skipping: NIGHTLY environment variable not set"
166167
exit 0
167168
fi
169+
=======
170+
.buildkite/scripts/run_in_docker.sh \
171+
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py && \
172+
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_bgmv.py && \
173+
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_layers.py'
174+
>>>>>>> c17bacea (add multi-chip test case)
168175

169176
- label: "E2E MLPerf tests for JAX + vLLM models on multiple chips"
170177
key: test_11
@@ -212,13 +219,19 @@ steps:
212219
queue: tpu_v6e_8_queue
213220
commands:
214221
- |
222+
<<<<<<< HEAD
215223
if [[ "$$NIGHTLY" == "1" ]]; then
216224
.buildkite/scripts/run_in_docker.sh \
217225
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'
218226
else
219227
echo "Skipping: NIGHTLY environment variable not set"
220228
exit 0
221229
fi
230+
=======
231+
.buildkite/scripts/run_in_docker.sh \
232+
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py && \
233+
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_layers.py'
234+
>>>>>>> c17bacea (add multi-chip test case)
222235

223236

224237
# -----------------------------------------------------------------

tests/lora/test_layers.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,10 +221,13 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
221221
)
222222

223223
axis_names = ("data", "model")
224+
devices = jax.devices()
224225
mesh_shape = (
225-
1, 1
226+
1, len(devices)
227+
# 1, 1
226228
) # TODO(xiowei): support multi-chip: mesh_shape = (1, len(jax.devices()))
227-
mesh = jax.make_mesh(mesh_shape, axis_names, devices=jax.devices())
229+
print(f'xw32 mesh_shape: {mesh_shape}')
230+
mesh = jax.make_mesh(mesh_shape, axis_names, devices=devices)
228231

229232
def create_column_parallel_packed_layer():
230233
# We first create a base linear layer, then a lora layer to wrap it.
@@ -281,7 +284,10 @@ def create_column_parallel_packed_layer():
281284
with torchax.default_env():
282285
# lora_linear.weight has type torchax.tensor.Tensor
283286
# BaseLinearLayerWithLoRA.weight property guarantees this.
284-
assert torch.equal(linear.weight, lora_linear.weight.to('cpu'))
287+
# if len(devices) != 1, `reorder_concatenated_tensor_for_sharding` function may reorder the out_features dimension of the weight matrix.
288+
# So the below check will fail.
289+
if len(devices) == 1:
290+
assert torch.equal(linear.weight.data, lora_linear.weight.to('cpu'))
285291

286292
max_num_batched_tokens = 8192
287293
max_batches = 256

0 commit comments

Comments (0)