From 3a614fc167928edfff835e2d35d619cac87d90bc Mon Sep 17 00:00:00 2001 From: Liang Shuhao Date: Mon, 3 Nov 2025 12:34:40 +0000 Subject: [PATCH] Normalize gates on expert dim before calculating seq_aux_loss --- paddlenlp/transformers/moe_gate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlenlp/transformers/moe_gate.py b/paddlenlp/transformers/moe_gate.py index 5507702fdcd8..f8b6695832f6 100644 --- a/paddlenlp/transformers/moe_gate.py +++ b/paddlenlp/transformers/moe_gate.py @@ -140,6 +140,8 @@ def _cal_seq_aux_loss(self, gates, top_k, topk_idx) -> paddle.Tensor: paddle.Tensor: The value of sequence auxiliary loss. """ batch_size, seq_len, _ = gates.shape + gates = gates / (gates.sum(axis=-1, keepdim=True) + 1e-20) + _, topk_idx = paddle.topk(gates, top_k, axis=-1) ce = paddle.zeros([batch_size, self.num_experts]) topk_idx = topk_idx.reshape([batch_size, -1]) ce.put_along_axis_(indices=topk_idx, values=paddle.ones([batch_size, seq_len * top_k]), axis=1, reduce="add")