
Commit 70ebcc9

[Bug fix] Pathways HBM calculation with DP enabled + Async scheduler precompilation (#1072)
Signed-off-by: wenxindongwork <wenxindong@google.com>
1 parent 825acd1 commit 70ebcc9

3 files changed: +36, −30 lines

tests/test_utils.py

Lines changed: 24 additions & 15 deletions
```diff
@@ -75,28 +75,36 @@ def test_hbm_usage_bytes_pathways_enabled(mock_devices, mock_live_arrays):
     mock_device2 = MagicMock()
     devices = [mock_device1, mock_device2]
 
-    # Create mock arrays with sharding
+    # Create mock device buffers
+    mock_buffer1_dev1 = MagicMock()
+    mock_buffer1_dev1.device = mock_device1
+    mock_buffer1_dev1.nbytes = 2000  # 2000 bytes on device1
+
+    mock_buffer1_dev2 = MagicMock()
+    mock_buffer1_dev2.device = mock_device2
+    mock_buffer1_dev2.nbytes = 2000  # 2000 bytes on device2
+
+    mock_buffer2_dev1 = MagicMock()
+    mock_buffer2_dev1.device = mock_device1
+    mock_buffer2_dev1.nbytes = 1000  # 1000 bytes on device1
+
+    # Create mock arrays with device buffers
     mock_array1 = MagicMock()
-    mock_array1.dtype.itemsize = 4  # float32
-    mock_array1.size = 1000  # 1000 elements
-    mock_array1.sharding.device_set = {mock_device1, mock_device2
-                                       }  # Sharded across 2 devices
+    mock_array1.device_buffers = [mock_buffer1_dev1, mock_buffer1_dev2]
 
     mock_array2 = MagicMock()
-    mock_array2.dtype.itemsize = 2  # float16
-    mock_array2.size = 500  # 500 elements
-    mock_array2.sharding.device_set = {mock_device1}  # Only on device1
+    mock_array2.device_buffers = [mock_buffer2_dev1]
 
     mock_live_arrays.return_value = [mock_array1, mock_array2]
 
     usage = hbm_usage_bytes(devices)
 
     # Expected calculations:
-    # Array1: 4 bytes * 1000 elements / 2 devices = 2000 bytes per device
-    # Array2: 2 bytes * 500 elements / 1 device = 1000 bytes on device1 only
-    # Device1: 2000 + 1000 = 3000 bytes
-    # Device2: 2000 + 0 = 2000 bytes
-    # hbm_limit = 33550237184 (hardcoded in the function)
+    # Array1: 2000 bytes on device1, 2000 bytes on device2
+    # Array2: 1000 bytes on device1
+    # Device1 total: 2000 + 1000 = 3000 bytes
+    # Device2 total: 2000 + 0 = 2000 bytes
+    # hbm_limit = 95 * GBYTES for TPU v5p
     expected_usage = [(3000, 95 * GBYTES), (2000, 95 * GBYTES)]
     assert usage == expected_usage
 
@@ -127,7 +135,7 @@ def test_hbm_usage_gb_pathways_disabled():
 @patch("jax.devices")
 def test_hbm_usage_bytes_pathways_no_arrays(mock_devices, mock_live_arrays):
     """Tests hbm_usage_bytes when VLLM_TPU_USING_PATHWAYS is True but no live arrays."""
-    # Mock TPU v5e devices
+    # Mock TPU v6e devices
     mock_jax_device = MagicMock()
     mock_jax_device.device_kind = "TPU v6e"
     mock_devices.return_value = [mock_jax_device]
@@ -141,7 +149,8 @@ def test_hbm_usage_bytes_pathways_no_arrays(mock_devices, mock_live_arrays):
 
     usage = hbm_usage_bytes(devices)
 
-    # No arrays means no memory usage
+    # No arrays means no memory usage, defaultdict returns 0 for missing keys
+    # HBM limit for TPU v6e is 32 GB
     expected_usage = [(0, 32 * GBYTES), (0, 32 * GBYTES)]
     assert usage == expected_usage
```
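The per-device totals asserted in the first test fall directly out of the buffer-based accounting this commit introduces. A minimal, self-contained sketch of that arithmetic with the same mocked buffer layout as the test (the real `hbm_usage_bytes` is not imported here, so this only illustrates the summation):

```python
from collections import defaultdict
from unittest.mock import MagicMock

# Two mock devices, mirroring the test above.
device1, device2 = MagicMock(), MagicMock()

# Mock arrays expose their per-device buffers directly.
array1 = MagicMock()
array1.device_buffers = [
    MagicMock(device=device1, nbytes=2000),
    MagicMock(device=device2, nbytes=2000),
]
array2 = MagicMock()
array2.device_buffers = [MagicMock(device=device1, nbytes=1000)]

# Sum actual buffer bytes per device, as the fixed implementation does.
hbm_used = defaultdict(int)
for array in (array1, array2):
    for buffer in array.device_buffers:
        hbm_used[buffer.device] += buffer.nbytes

print(hbm_used[device1], hbm_used[device2])  # 3000 2000
```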

tpu_inference/runner/compilation_manager.py

Lines changed: 10 additions & 9 deletions
```diff
@@ -202,20 +202,21 @@ def _precompile_substitute_placeholder_token(self) -> None:
         """
 
         for num_tokens in self.runner.num_tokens_paddings:
-            padded_token_in_tpu_cur_input_indices = np.zeros((num_tokens, ),
-                                                             dtype=np.int32)
-            padded_token_in_tpu_pre_next_tokens_indices = np.zeros(
-                (num_tokens, ), dtype=jnp.int32)
-            (padded_token_in_tpu_cur_input_indices,
-             padded_token_in_tpu_pre_next_tokens_indices) = device_array(
-                 self.runner.mesh,
-                 (padded_token_in_tpu_cur_input_indices,
-                  padded_token_in_tpu_pre_next_tokens_indices))
             dp_sharding = NamedSharding(
                 self.runner.mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, )
             ) if self.runner.vllm_config.sharding_config.total_dp_size > 1 else None
 
             for num_reqs in self.runner.num_reqs_paddings:
+                padded_token_in_tpu_cur_input_indices = np.zeros(
+                    (num_tokens, ), dtype=np.int32)
+                padded_token_in_tpu_pre_next_tokens_indices = np.zeros(
+                    (num_tokens, ), dtype=jnp.int32)
+                (padded_token_in_tpu_cur_input_indices,
+                 padded_token_in_tpu_pre_next_tokens_indices) = device_array(
+                     self.runner.mesh,
+                     (padded_token_in_tpu_cur_input_indices,
+                      padded_token_in_tpu_pre_next_tokens_indices))
+
                 input_ids = self._create_dummy_tensor((num_tokens, ),
                                                       jnp.int32, dp_sharding)
                 # Need align to the sampling output
```
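The fix rebuilds and re-places the placeholder index arrays inside the `num_reqs` loop, so each `(num_tokens, num_reqs)` precompilation variant starts from freshly device-placed inputs instead of reusing arrays created once per `num_tokens`. A rough sketch of the resulting loop shape, with assumed padding ladders and plain NumPy standing in for the mesh placement that `device_array` performs:

```python
import numpy as np

# Assumed padding ladders, for illustration only.
num_tokens_paddings = [16, 32, 64]
num_reqs_paddings = [4, 8]

for num_tokens in num_tokens_paddings:
    # dp_sharding would be computed once per num_tokens here.
    for num_reqs in num_reqs_paddings:
        # Rebuilt on every (num_tokens, num_reqs) combination; in the real
        # code these are then moved onto the mesh via device_array(...).
        cur_input_indices = np.zeros((num_tokens, ), dtype=np.int32)
        pre_next_tokens_indices = np.zeros((num_tokens, ), dtype=np.int32)
        print(num_tokens, num_reqs, cur_input_indices.shape,
              pre_next_tokens_indices.shape)
```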

tpu_inference/utils.py

Lines changed: 2 additions & 6 deletions
```diff
@@ -132,12 +132,8 @@ def pathways_hbm_usage_gb(devices: Any) -> List[Tuple[float, float]]:
     hbm_used = defaultdict(int)
     hbm_limit = get_device_hbm_limit()
     for array in live_arrays:
-        assert hasattr(array, 'sharding') and hasattr(
-            array.sharding, 'device_set'
-        ), "This function must not be called within jax tracer (e.g. jit, vmap, grad)"
-        for device in array.sharding.device_set:
-            hbm_used[device] += array.dtype.itemsize * array.size // len(
-                array.sharding.device_set)
+        for buffer in array.device_buffers:
+            hbm_used[buffer.device] += buffer.nbytes
     return [(hbm_used[device], hbm_limit) for device in devices]
```
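The removed logic split each array's bytes evenly across `sharding.device_set`, which undercounts arrays that are replicated rather than sharded; under data parallelism a replicated array occupies its full size on every device that holds it, which is a plausible failure mode behind this fix. A sketch of the discrepancy using mocks (not real JAX arrays; `device_buffers` stands in for the runtime's per-device buffer list):

```python
from collections import defaultdict
from unittest.mock import MagicMock

# Assumed DP scenario: one array fully replicated on two devices, so it
# occupies its full 4000 bytes on EACH device.
dev1, dev2 = MagicMock(), MagicMock()
array = MagicMock()
array.dtype.itemsize = 4
array.size = 1000  # 4 * 1000 = 4000 bytes per replica
array.sharding.device_set = {dev1, dev2}
array.device_buffers = [
    MagicMock(device=dev1, nbytes=4000),
    MagicMock(device=dev2, nbytes=4000),
]

# Old accounting: even split across the device set -> 2000 per device,
# half the real footprint of a replicated array.
old_per_device = array.dtype.itemsize * array.size // len(
    array.sharding.device_set)
print(old_per_device)  # 2000

# New accounting: sum actual buffer bytes per device -> 4000 per device.
hbm_used = defaultdict(int)
for buffer in array.device_buffers:
    hbm_used[buffer.device] += buffer.nbytes
print(hbm_used[dev1], hbm_used[dev2])  # 4000 4000
```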