diff --git a/paddlenlp/transformers/deepseek_v2/modeling.py b/paddlenlp/transformers/deepseek_v2/modeling.py index ca71b478ee49..b77014e552f7 100644 --- a/paddlenlp/transformers/deepseek_v2/modeling.py +++ b/paddlenlp/transformers/deepseek_v2/modeling.py @@ -741,6 +741,8 @@ def forward(self, hidden_states): return scores, routing_map, l_aux, l_zloss capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss = self.topkgating(scores) + dispatch_mask.stop_gradient = True + exp_counts.stop_gradient = True return capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss