
Commit 00d38cc

Add test for payload not in the workspace and fix coderabbit comments

1 parent: 94df845

File tree: 3 files changed (+72, -26 lines)

csrc/trtllm_moe_alltoall.cu (4 additions, 1 deletion)

@@ -127,7 +127,6 @@ Tuple<Array<int64_t>, Array<int64_t>, int64_t> moeA2ADispatchOp(
     TensorView metainfo, int64_t runtimeMaxTokensPerRank, int64_t epRank, int64_t epSize,
     int64_t topK, int64_t numExperts) {
   using tl_throughput::PayloadDescriptor;
-  fflush(stdout);
 
   CHECK_INPUT(tokenSelectedExperts);
   CHECK_INPUT_TYPE(tokenSelectedExperts, dl_int32);
@@ -388,6 +387,10 @@ void moeA2ASanitizeExpertIdsOp(TensorView expertIds, TensorView workspace, Tenso
       static_cast<int32_t*>(expertIds.data_ptr()), recvCounters,
       static_cast<int32_t>(invalidExpertId), static_cast<int>(epSize),
       static_cast<int>(runtimeMaxTokensPerRank), static_cast<int>(topK), get_current_stream());
+
+  auto err = cudaGetLastError();
+  TVM_FFI_ICHECK(err == cudaSuccess)
+      << "moe_a2a_sanitize_expert_ids launch failed: " << cudaGetErrorString(err);
 }
 
 // Expose metainfo index constants for Python access

flashinfer/comm/trtllm_moe_alltoall.py (35 additions, 7 deletions)

@@ -178,6 +178,18 @@ def moe_a2a_get_workspace_size_per_rank(
     total_dispatch_payload_size_per_token: int,
     combine_payload_size_per_token: int,
 ):
+    """
+    Get the workspace size per rank for the MoeAlltoAll operation.
+
+    Args:
+        ep_size: Total expert parallel size
+        max_num_tokens: Maximum number of tokens across all ranks
+        total_dispatch_payload_size_per_token: The size of the payload per token in the dispatch phase. This should be the sum of the per-token sizes of all payload tensors.
+        combine_payload_size_per_token: The size of the payload per token in the combine phase.
+
+    Returns:
+        workspace_size_per_rank: Size of the workspace per rank in bytes
+    """
     return module.moe_a2a_get_workspace_size_per_rank(
         ep_size,
         max_num_tokens,
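A hedged usage sketch for the sizing helper documented above; the payload widths, dtype, and EP configuration below are made up for illustration:

```python
import torch
from flashinfer.comm import trtllm_moe_alltoall

# Hypothetical layout: each dispatched token carries one bf16 hidden-state
# payload of width 2048, and combine returns a bf16 vector of the same width.
ep_size, max_num_tokens, hidden = 8, 1024, 2048
dispatch_bytes_per_token = hidden * torch.bfloat16.itemsize  # sum over all dispatch payloads
combine_bytes_per_token = hidden * torch.bfloat16.itemsize

size_per_rank = trtllm_moe_alltoall.moe_a2a_get_workspace_size_per_rank(
    ep_size,
    max_num_tokens,
    dispatch_bytes_per_token,
    combine_bytes_per_token,
)
# Per the docstrings in this file, a [ep_size, size_per_rank] workspace tensor
# then backs both the dispatch and combine phases.
```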
@@ -218,15 +230,13 @@ def moe_a2a_wrap_payload_tensor_in_workspace(
 
     Args:
         workspace: [ep_size, size_per_rank] workspace tensor
-        ep_rank: Current expert parallel rank
-        ep_size: Total expert parallel size
-        runtime_max_tokens_per_rank: Max tokens per rank in this batch
-        total_size: Total size of the payload
-        offset: Offset from dispatch
-        dtype: Data type for the tensor
+        leading_shape: The leading shape to wrap the tensor with
+        slice_start: The start of the slice in the workspace
+        slice_end: The end of the slice in the workspace
+        dtype: Data type for the output tensor
 
     Returns:
-        tensor: [ep_size * max_tokens, hidden_size] workspace-backed tensor
+        tensor: [leading_shape, *] workspace-backed tensor
     """
     workspace_base = workspace.view(-1).view(dtype=torch.uint8)
     assert slice_end <= workspace.numel(), (
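For readers skimming the diff, here is a self-contained sketch of the byte-slice-and-view mechanics the new docstring describes. It is a plausible reconstruction from the two visible body lines, not the function's actual implementation:

```python
import torch

def wrap_slice(workspace, leading_shape, slice_start, slice_end, dtype):
    # Flatten the workspace and reinterpret it as raw bytes (as in the diff),
    # then carve out [slice_start, slice_end) and view it as `dtype` with the
    # requested leading dimensions; -1 absorbs whatever trails them.
    workspace_base = workspace.view(-1).view(dtype=torch.uint8)
    assert slice_end <= workspace_base.numel(), "slice exceeds workspace"
    payload = workspace_base[slice_start:slice_end].view(dtype=dtype)
    return payload.view(*leading_shape, -1)

# Hypothetical use: carve a [2, 64]-leading bf16 tensor out of a byte buffer.
ws = torch.zeros(1 << 20, dtype=torch.uint8)  # real workspaces live on GPU
t = wrap_slice(ws, [2, 64], 0, 2 * 64 * 8 * torch.bfloat16.itemsize, torch.bfloat16)
assert t.shape == (2, 64, 8)
```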
@@ -249,6 +259,24 @@ def moe_a2a_dispatch(
     top_k: int,
     num_experts: int,
 ):
+    """
+    Dispatch tokens and payloads to expert ranks.
+
+    Args:
+        token_selected_experts: [local_num_tokens, top_k] int32 tensor
+        input_payloads: List of [local_num_tokens, *] tensors to dispatch
+        workspace: [ep_size, size_per_rank] workspace tensor
+        metainfo: Metadata tensor from initialize
+        runtime_max_tokens_per_rank: Max tokens per rank in this batch
+        ep_rank: Current expert parallel rank
+        ep_size: Total expert parallel size
+        top_k: Number of experts per token
+        num_experts: Total number of experts
+
+    Returns:
+        output_payloads: List of payloads for this rank, backed by data in the workspace
+        combine_payload_offset: The offset to place the combine payload in the workspace
+    """
     recv_offsets, recv_sizes, combine_payload_offset = (
         get_mnnvl_moe_alltoall_module().moe_a2a_dispatch(
             token_selected_experts,
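And a matching sketch of a dispatch call, following the argument order listed in the new docstring (the shapes, rank values, and single-payload setup are illustrative, and `workspace` and `metainfo` are assumed to come from the sizing and initialization steps, which are not shown here):

```python
import torch
from flashinfer.comm import trtllm_moe_alltoall

local_num_tokens, top_k, num_experts, ep_size = 16, 2, 64, 8
token_selected_experts = torch.randint(
    0, num_experts, (local_num_tokens, top_k), dtype=torch.int32, device="cuda"
)
hidden_states = torch.randn(
    local_num_tokens, 2048, dtype=torch.bfloat16, device="cuda"
)

# Per the docstring: returns workspace-backed output payloads plus the byte
# offset at which the combine-phase payload should later be placed.
output_payloads, combine_payload_offset = trtllm_moe_alltoall.moe_a2a_dispatch(
    token_selected_experts,
    [hidden_states],   # input_payloads: a single payload tensor here
    workspace,         # [ep_size, size_per_rank], sized as sketched earlier
    metainfo,          # metadata tensor from initialization
    local_num_tokens,  # runtime_max_tokens_per_rank
    0,                 # ep_rank
    ep_size,
    top_k,
    num_experts,
)
```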

tests/comm/test_trtllm_moe_alltoall.py (33 additions, 18 deletions)

@@ -49,12 +49,14 @@ def setup_test_environment():
 ]
 
 COMBINE_PARAMS = [
-    (2, 64, 8, 2, torch.bfloat16),  # Small input, 2 ranks
-    (4, 32, 32768, 4, torch.bfloat16),  # Large input, 4 ranks
-    (8, 16, 2048, 8, torch.bfloat16),  # Medium input, 8 ranks
-    (2, 64, 8, 2, torch.float16),  # Small input, 2 ranks
-    (4, 32, 32768, 4, torch.float16),  # Large input, 4 ranks
-    (8, 16, 2048, 8, torch.float16),  # Medium input, 8 ranks
+    (2, 64, 8, 2, torch.bfloat16, True),  # Small input, 2 ranks
+    (4, 32, 32768, 4, torch.bfloat16, True),  # Large input, 4 ranks
+    (8, 16, 2048, 8, torch.bfloat16, True),  # Medium input, 8 ranks
+    (8, 16, 2048, 8, torch.bfloat16, False),  # Medium input, 8 ranks
+    (2, 64, 8, 2, torch.float16, True),  # Small input, 2 ranks
+    (4, 32, 32768, 4, torch.float16, True),  # Large input, 4 ranks
+    (8, 16, 2048, 8, torch.float16, True),  # Medium input, 8 ranks
+    (8, 16, 2048, 8, torch.float16, False),  # Medium input, 8 ranks
 ]
 
 
@@ -429,9 +431,11 @@ def fake_moe(
     return processed_states.view(target_shape)
 
 
-@pytest.mark.parametrize("world_size,num_tokens,vector_dim,top_k,dtype", COMBINE_PARAMS)
+@pytest.mark.parametrize(
+    "world_size,num_tokens,vector_dim,top_k,dtype,payload_in_workspace", COMBINE_PARAMS
+)
 def test_moe_combine_multi_rank_single_gpu(
-    world_size, num_tokens, vector_dim, top_k, dtype
+    world_size, num_tokens, vector_dim, top_k, dtype, payload_in_workspace
 ):
     torch.cuda.set_device(0)
     check_sufficient_sm_count(num_tokens, world_size)
@@ -489,16 +493,27 @@ def test_moe_combine_multi_rank_single_gpu(
 
     inplace_combine_tensors = []
     for rank in range(world_size):
-        inplace_combine_tensors.append(
-            trtllm_moe_alltoall.moe_a2a_wrap_payload_tensor_in_workspace(
-                all_workspaces[rank, :],
-                [world_size, num_tokens],
-                combine_payload_offsets[rank],
-                combine_payload_offsets[rank]
-                + world_size * num_tokens * vector_dim * dtype.itemsize,
-                dtype,
+        if payload_in_workspace:
+            inplace_combine_tensors.append(
+                trtllm_moe_alltoall.moe_a2a_wrap_payload_tensor_in_workspace(
+                    all_workspaces[rank, :],
+                    [world_size, num_tokens],
+                    combine_payload_offsets[rank],
+                    combine_payload_offsets[rank]
+                    + world_size * num_tokens * vector_dim * dtype.itemsize,
+                    dtype,
+                )
+            )
+        else:
+            inplace_combine_tensors.append(
+                torch.empty(
+                    world_size,
+                    num_tokens,
+                    vector_dim,
+                    dtype=dtype,
+                    device=torch.device("cuda"),
+                )
             )
-        )
 
     for rank in range(world_size):
         inplace_combine_tensors[rank].copy_(
@@ -520,7 +535,7 @@ def test_moe_combine_multi_rank_single_gpu(
         metainfo,
         world_size,
         combine_payload_offsets,
-        payload_in_workspace=True,
+        payload_in_workspace=payload_in_workspace,
     )
 
     reference_result = fake_moe(
