Commit acd8694

[https://nvbugs/5680133][fix] Implement customizable router for cutlass MoE.
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
1 parent 85b4c92 commit acd8694

9 files changed: +101 -39 lines changed

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h (5 additions, 3 deletions)

@@ -960,7 +960,8 @@ struct GemmProfilerBackend
         }
     }

-    void prepare(int num_tokens, char* workspace, void const* expert_weights, cudaStream_t stream);
+    void prepare(int num_tokens, char* workspace, void const* expert_weights, cudaStream_t stream,
+        void const* token_selected_experts_customized = nullptr, bool use_customized_router = false);

     std::map<std::string, std::pair<size_t, size_t>> getProfilerWorkspaces(int maxM, bool is_tma_ws);
     size_t getWorkspaceSize(int maxM);

@@ -990,7 +991,7 @@ struct GemmProfilerBackend
     nvinfer1::DataType mOType{};

     // This will be a unique value for every iteration of warmup and actual bench
-    constexpr static int64_t NUM_ROUTING_SAMPLES = 16;
+    constexpr static int64_t NUM_ROUTING_SAMPLES = 1;

     constexpr static int64_t NUM_FUSION_TYPES = 2;
     constexpr static int64_t NUM_SWAP_AB_TYPES = 2;

@@ -1006,8 +1007,9 @@ struct GemmProfilerBackend
     TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType mScalingType{};

private:
-    void prepareRouting(int num_tokens, char* workspace, cudaStream_t stream);
     void prepareQuantParams(int num_tokens, char* workspace, cudaStream_t stream);
+    void prepareRouting(int num_tokens, char* workspace, cudaStream_t stream,
+        void const* token_selected_experts_customized = nullptr, bool use_customized_router = false);
     void prepareTmaWsInputs(int num_tokens, char* workspace, void const* expert_weights,
         TmaWarpSpecializedGroupedGemmInput::EpilogueFusion fusion, bool swap_ab, cudaStream_t stream);
};

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu (19 additions, 8 deletions)

@@ -4288,7 +4288,8 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
     return out_map;
}

-void GemmProfilerBackend::prepareRouting(int num_tokens, char* workspace_ptr_char, cudaStream_t stream)
+void GemmProfilerBackend::prepareRouting(int num_tokens, char* workspace_ptr_char, cudaStream_t stream,
+    void const* token_selected_experts_customized, bool use_customized_router)
{
    auto workspaces = getProfilerWorkspaces(num_tokens, mSM >= 90);
#define GET_WS_PTR_BASE(type, name) \

@@ -4329,10 +4330,19 @@ void GemmProfilerBackend::prepareRouting(int num_tokens, char* workspace_ptr_cha
    int const start_expert_id = mNumExpertsPerNode * mParallelismConfig.ep_rank;

    uint32_t num_threads = 256;
-    dim3 grid_dim{(num_tokens + num_threads - 1) / num_threads, NUM_ROUTING_SAMPLES, 1};
-    prepareFakeRouterBuffers<<<grid_dim, num_threads, 0, stream>>>(
-        token_selected_experts_base, num_tokens, mK, mNumExperts);
-    sync_check_cuda_error(stream);
+    if (use_customized_router)
+    {
+        // copy token selected experts to token_selected_experts_base
+        cudaMemcpyAsync(token_selected_experts_base, token_selected_experts_customized,
+            num_tokens * mK * sizeof(int), cudaMemcpyDeviceToDevice, stream);
+    }
+    else
+    {
+        dim3 grid_dim{(num_tokens + num_threads - 1) / num_threads, NUM_ROUTING_SAMPLES, 1};
+        prepareFakeRouterBuffers<<<grid_dim, num_threads, 0, stream>>>(
+            token_selected_experts_base, num_tokens, mK, mNumExperts);
+        sync_check_cuda_error(stream);
+    }

    for (int64_t i = 0; i < NUM_ROUTING_SAMPLES; i++)
    {

@@ -4539,15 +4549,16 @@ void GemmProfilerBackend::prepareTmaWsInputs(int num_tokens, char* workspace_ptr
    }
}

-void GemmProfilerBackend::prepare(
-    int num_tokens, char* workspace_ptr_char, void const* expert_weights, cudaStream_t stream)
+void GemmProfilerBackend::prepare(int num_tokens, char* workspace_ptr_char, void const* expert_weights,
+    cudaStream_t stream, void const* token_selected_experts_customized, bool use_customized_router)
{
    mSampleIndex = 0;

    auto workspace_size = getWorkspaceSize(num_tokens);
    populateRandomBuffer(workspace_ptr_char, workspace_size, stream);
+    auto workspaces = getProfilerWorkspaces(num_tokens, mSM >= 90);

-    prepareRouting(num_tokens, workspace_ptr_char, stream);
+    prepareRouting(num_tokens, workspace_ptr_char, stream, token_selected_experts_customized, use_customized_router);
    prepareQuantParams(num_tokens, workspace_ptr_char, stream);
    for (auto fusion : {TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE,
             TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE})
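
Note on the customized path: prepareRouting now copies num_tokens * mK * sizeof(int) bytes straight from the caller-supplied buffer, so the routing tensor handed to the profiler is expected to be a device int32 tensor of shape [num_tokens, top_k]. A minimal sketch of building such a buffer (names and sizes are illustrative, not from the diff, and a CUDA device is assumed):

import torch

num_tokens, top_k, num_experts = 128, 8, 64
# One row of top_k distinct expert ids per token, laid out contiguously.
token_selected_experts = torch.stack(
    [torch.randperm(num_experts)[:top_k] for _ in range(num_tokens)]
).to(dtype=torch.int32, device="cuda")
assert token_selected_experts.shape == (num_tokens, top_k)
assert token_selected_experts.dtype == torch.int32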

cpp/tensorrt_llm/thop/moeOp.cpp (11 additions, 8 deletions)

@@ -660,13 +660,13 @@ class FusedMoeRunner : public torch::CustomClassHolder
    }

    // TODO Update this to be able to tell if we are profiling swiglu bias
-    void runGemmProfile(torch::Tensor const& input, torch::Tensor const& fc1_expert_weights,
-        torch::optional<torch::Tensor> const& fc1_expert_biases, torch::Tensor const& fc2_expert_weights,
-        torch::optional<torch::Tensor> const& fc2_expert_biases, int64_t const top_k, int64_t const tp_size,
-        int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
-        int64_t const cluster_rank, bool const enable_alltoall, bool const min_latency_mode, int64_t const gemm_idx,
-        int64_t const profile_id, bool const do_preparation, int64_t const activation_type_int,
-        int64_t const unpadded_hidden_size)
+    void runGemmProfile(torch::Tensor const& input, torch::optional<torch::Tensor> const& token_final_scales,
+        torch::Tensor const& fc1_expert_weights, torch::optional<torch::Tensor> const& fc1_expert_biases,
+        torch::Tensor const& fc2_expert_weights, torch::optional<torch::Tensor> const& fc2_expert_biases,
+        int64_t const top_k, int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
+        int64_t const cluster_size, int64_t const cluster_rank, bool const enable_alltoall, bool const min_latency_mode,
+        int64_t const gemm_idx, int64_t const profile_id, bool const do_preparation, int64_t const activation_type_int,
+        int64_t const unpadded_hidden_size, bool const use_customized_router)
    {
        std::lock_guard<std::mutex> lock(mMutex);

@@ -746,7 +746,10 @@ class FusedMoeRunner : public torch::CustomClassHolder
            auto const cu_malloc_status = cudaMalloc(&mProfileWorkspace, profile_workspace_size);
            TORCH_CHECK(cu_malloc_status == cudaSuccess, "Can't allocate profile workspace for MoE GEMM profile.");

-            mProfiler->prepare(num_rows, mProfileWorkspace, expert_weights_ptr, stream);
+            void const* token_selected_experts_customized
+                = token_final_scales.has_value() ? token_final_scales.value().const_data_ptr() : nullptr;
+            mProfiler->prepare(num_rows, mProfileWorkspace, expert_weights_ptr, stream,
+                token_selected_experts_customized, use_customized_router);
        }

        // Profile specific tactic. Assuming at least one preparation phase has been executed already.

tensorrt_llm/_torch/autotuner.py (2 additions, 1 deletion)

@@ -798,7 +798,8 @@ def _profile_runners(
        best_runner_id, best_tactic = None, None
        # If the inputs_pre_hook is provided, it will be called before profiling.
        if tuning_config.inputs_pre_hook is not None:
-            input_tensors = tuning_config.inputs_pre_hook(input_tensors)
+            input_tensors = tuning_config.inputs_pre_hook(
+                input_tensors, **kwargs)
        for runner_id, runner in enumerate(runners):
            # TODO: use FakeTensor here.
            runner_arg_names = {
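
Because the autotuner now forwards its extra keyword arguments (for example ep_size in the cutlass MoE path below) into inputs_pre_hook, hook implementations must accept **kwargs. A minimal sketch of a compatible hook, assuming nothing beyond what this call site shows:

from typing import List
import torch

def example_inputs_pre_hook(inputs: List[torch.Tensor], **kwargs) -> List[torch.Tensor]:
    # kwargs may carry tuner-level extras such as ep_size; ignore what you don't need.
    return inputs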

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py (6 additions, 6 deletions)

@@ -440,8 +440,8 @@ def generate_permuted_idx_to_expanded_idx(
                permuted_idx_to_expanded_idx.append(self.pad_val)
        return permuted_idx_to_expanded_idx

-    def inputs_pre_hook(self,
-                        inputs: List[torch.Tensor]) -> List[torch.Tensor]:
+    def inputs_pre_hook(self, inputs: List[torch.Tensor],
+                        **kwargs) -> List[torch.Tensor]:
        a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles, *others = inputs
        num_tokens = self.infer_num_tokens(a.size(0))
        num_tokens_per_expert = self.generate_num_tokens_per_expert(

@@ -465,8 +465,8 @@ def inputs_pre_hook(self,
            device=num_non_exiting_tiles.device)
        return a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles, *others

-    def inputs_pre_hook_finalize_fusion(
-            self, inputs: List[torch.Tensor]) -> List[torch.Tensor]:
+    def inputs_pre_hook_finalize_fusion(self, inputs: List[torch.Tensor],
+                                        **kwargs) -> List[torch.Tensor]:
        a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, tile_idx_to_mn_limit, permuted_idx_to_expanded_idx, num_non_exiting_tiles, token_final_scales = inputs
        num_tokens = self.infer_num_tokens(a.size(0))
        num_tokens_per_expert = self.generate_num_tokens_per_expert(

@@ -1414,8 +1414,8 @@ def __init__(self, num_experts: int, top_k: int, num_local_experts: int,
    def infer_shape_num_tokens(self, input_shapes: List[torch.Size]) -> int:
        return input_shapes[0][0]

-    def inputs_pre_hook(self,
-                        inputs: List[torch.Tensor]) -> List[torch.Tensor]:
+    def inputs_pre_hook(self, inputs: List[torch.Tensor],
+                        **kwargs) -> List[torch.Tensor]:
        x, x_sf, token_selected_experts, token_final_scales, *others = inputs
        num_tokens = token_selected_experts.size(0)
        new_token_final_scales, new_token_selected_experts = torch.randn(

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py (31 additions, 10 deletions)

@@ -14,6 +14,7 @@
 from ..modules.multi_stream_utils import do_multi_stream
 from ..modules.swiglu import silu_and_mul_kernel
 from ..utils import (ActivationType, fp4_scale_infer_shape,
+                     gen_balanced_moe_routing_input,
                      get_last_power_of_2_num_tokens_buckets,
                      last_positive_power_of_2)

@@ -24,6 +25,18 @@ def bmm_out(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
    torch.bmm(a, b, out=out)


+def inputs_pre_hook(inputs: List[torch.Tensor], ep_size: int,
+                    **kwargs) -> List[torch.Tensor]:
+    x, token_selected_experts, fc1_expert_weights, fc1_expert_biases, fc2_expert_weights, fc2_expert_biases = inputs
+    num_tokens = x.shape[0]
+    num_experts = fc2_expert_weights.shape[0] * ep_size
+    top_k = token_selected_experts.shape[1]
+    router = gen_balanced_moe_routing_input(num_tokens, num_experts, top_k)
+    inputs[1] = router.to(dtype=torch.int32,
+                          device=token_selected_experts.device)
+    return inputs
+
+
class MoERunner(TunableRunner):
    # avoid overhead of creating a new runner in forward pass
    runner_dict = dict()

@@ -32,6 +45,7 @@ class MoERunner(TunableRunner):
            0, 0, get_last_power_of_2_num_tokens_buckets,
            last_positive_power_of_2), ),
        tune_max_num_tokens=8192,
+        inputs_pre_hook=inputs_pre_hook,
    )

    def __init__(

@@ -99,10 +113,13 @@ def forward(
        gemm_idx: int = 0,
        tactic: int = -1,
        do_preparation: bool = False,
+        **kwargs,
    ):
-        x, fc1_expert_weights, fc1_expert_biases, fc2_expert_weights, fc2_expert_biases = inputs
+        x, token_selected_experts, fc1_expert_weights, fc1_expert_biases, fc2_expert_weights, fc2_expert_biases = inputs
+        use_customized_router = True
        self.fused_moe_runner.run_gemm_profile(
            x,
+            token_selected_experts,
            fc1_expert_weights,
            fc1_expert_biases,
            fc2_expert_weights,

@@ -121,6 +138,7 @@ def forward(
            do_preparation,
            self.activation_type,
            self.unpadded_hidden_size,
+            use_customized_router,
        )


@@ -197,27 +215,30 @@ def fused_moe(
    )

    MoERunner.tuning_config.tune_max_num_tokens = tune_max_num_tokens
-
+    input_tensors = [
+        tuner_input,
+        token_selected_experts,
+        fc1_expert_weights,
+        fc1_expert_biases,
+        fc2_expert_weights,
+        fc2_expert_biases,
+    ]
    _, gemm_tactic_1 = tuner.choose_one(
        "trtllm::fused_moe::gemm1",
        [moe_runner],
        MoERunner.tuning_config,
-        [
-            tuner_input, fc1_expert_weights, fc1_expert_biases,
-            fc2_expert_weights, fc2_expert_biases
-        ],
+        input_tensors,
        gemm_idx=1,
+        ep_size=ep_size,
    )

    _, gemm_tactic_2 = tuner.choose_one(
        "trtllm::fused_moe::gemm2",
        [moe_runner],
        MoERunner.tuning_config,
-        [
-            tuner_input, fc1_expert_weights, fc1_expert_biases,
-            fc2_expert_weights, fc2_expert_biases
-        ],
+        input_tensors,
        gemm_idx=2,
+        ep_size=ep_size,
    )

    run_moe = moe_runner.fused_moe_runner.run_moe_min_latency if min_latency_mode else moe_runner.fused_moe_runner.run_moe
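
The tuner input list now carries token_selected_experts, and the module-level inputs_pre_hook overwrites it with balanced routing before profiling. A rough standalone usage sketch, assuming the package imports in your environment; the tensor shapes are made up (only x.shape[0], token_selected_experts.shape[1] and fc2_expert_weights.shape[0] matter to the hook) and the biases are passed as None:

import torch
from tensorrt_llm._torch.custom_ops.torch_custom_ops import inputs_pre_hook

num_tokens, hidden, top_k, num_experts = 16, 64, 2, 8
inputs = [
    torch.randn(num_tokens, hidden),                            # x
    torch.zeros(num_tokens, top_k, dtype=torch.int32),          # token_selected_experts
    torch.randn(num_experts, 2 * hidden, hidden),               # fc1_expert_weights
    None,                                                       # fc1_expert_biases
    torch.randn(num_experts, hidden, hidden),                   # fc2_expert_weights
    None,                                                       # fc2_expert_biases
]
inputs = inputs_pre_hook(inputs, ep_size=1)
# inputs[1] now holds a balanced [num_tokens, top_k] int32 routing tensor.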

tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py (2 additions, 2 deletions)

@@ -79,8 +79,8 @@ def prepare_dummy_topk_and_hook(
        routing_logits_for_tuner = routing_logits

    # Define hook to recreate dummy tensors when shape changes during profiling
-    def recreate_dummy_topk_if_needed(
-            inputs: List[torch.Tensor]) -> List[torch.Tensor]:
+    def recreate_dummy_topk_if_needed(inputs: List[torch.Tensor],
+                                      **kwargs) -> List[torch.Tensor]:
        """Recreate dummy topk tensors if token count changed during profiling."""
        current_num_tokens = inputs[hidden_states_index].shape[0]

tensorrt_llm/_torch/utils.py (24 additions, 0 deletions)

@@ -370,3 +370,27 @@ def wrapper(*args, **kwargs):
        return wrapper

    return decorator(func) if func else decorator
+
+
+def gen_balanced_moe_routing_input(num_tokens: int, num_experts: int,
+                                   top_k: int) -> torch.Tensor:
+    """
+    Generate balanced routing input for MoE routing.
+    """
+    token_selected_experts_gen = torch.zeros(num_tokens, top_k)
+    # select k unique experts from num_experts for each token
+    for i in range(num_tokens):
+        token_selected_experts_gen[i] = torch.randperm(num_experts)[:top_k]
+    return token_selected_experts_gen
+
+
+def gen_imbalanced_moe_routing_input(num_tokens: int, num_experts: int,
+                                     top_k: int) -> torch.Tensor:
+    """
+    Generate imbalanced routing input for MoE routing.
+    """
+    token_selected_experts_gen = torch.zeros(num_tokens, top_k)
+    # every token selects the same first top_k experts
+    for i in range(num_tokens):
+        token_selected_experts_gen[i] = torch.arange(0, top_k)
+    return token_selected_experts_gen
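
For illustration, a small sketch contrasting the two new generators (assuming the package is importable): the balanced variant spreads routing slots roughly evenly across experts, while the imbalanced variant sends every token to the first top_k experts.

import torch
from tensorrt_llm._torch.utils import (gen_balanced_moe_routing_input,
                                       gen_imbalanced_moe_routing_input)

num_tokens, num_experts, top_k = 1024, 16, 2
balanced = gen_balanced_moe_routing_input(num_tokens, num_experts, top_k)
imbalanced = gen_imbalanced_moe_routing_input(num_tokens, num_experts, top_k)

# Balanced: each expert gets roughly num_tokens * top_k / num_experts slots.
print(torch.bincount(balanced.flatten().long(), minlength=num_experts))
# Imbalanced: all slots land on experts [0, top_k); the rest stay empty.
print(torch.bincount(imbalanced.flatten().long(), minlength=num_experts))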

tests/unittest/_torch/misc/test_autotuner.py (1 addition, 1 deletion)

@@ -363,7 +363,7 @@ def forward(
        return [gemm_0, gemm_1, gemm_fallback][tactic_id](*inputs)

    @staticmethod
-    def inputs_pre_hook(inputs: List[torch.Tensor]):
+    def inputs_pre_hook(inputs: List[torch.Tensor], **kwargs):
        # always set the first element to be iota in x
        x, w = inputs
        x_hooked = torch.zeros_like(x)
