From 830021b67ac412f01eefc6839df56ae306e46790 Mon Sep 17 00:00:00 2001
From: zrr1999 <2742392377@qq.com>
Date: Fri, 7 Nov 2025 08:59:53 +0000
Subject: [PATCH 1/2] use auto in intermediate expr

---
 .../cpu/add_position_encoding_kernel.cc | 5 +-
 .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/box_coder_kernel.cc | 4 +-
 .../cpu/broadcast_tensors_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/conv_util.h | 5 +-
 .../kernels/cpu/cross_entropy_grad_kernel.cc | 8 +-
 .../cpu/distribute_fpn_proposals_kernel.cc | 2 +-
 .../cpu/lookup_table_dequant_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/lrn_kernel.cc | 2 +-
 .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 2 +-
 .../phi/kernels/cpu/psroi_pool_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/psroi_pool_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/rnn_functor.h | 6 +-
 paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 2 +-
 .../phi/kernels/cpu/roi_align_grad_kernel.cc | 2 +-
 .../phi/kernels/cpu/roi_pool_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/roi_pool_kernel.cc | 5 +-
 .../cpu/sequence_expand_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/svd_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/unpool_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/unpool_kernel.cc | 4 +-
 .../phi/kernels/cpu/viterbi_decode_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/yolo_loss_kernel.cc | 4 +-
 paddle/phi/kernels/funcs/aligned_vector.h | 2 +-
 paddle/phi/kernels/funcs/blas/blas_impl.h | 26 +-
 paddle/phi/kernels/funcs/block_radix_topk.cuh | 6 +-
 paddle/phi/kernels/funcs/broadcast_function.h | 4 +-
 .../phi/kernels/funcs/correlation_funcs.cu.h | 4 +-
 paddle/phi/kernels/funcs/correlation_funcs.h | 4 +-
 .../kernels/funcs/deformable_conv_functor.cc | 5 +-
 .../phi/kernels/funcs/detail/gru_gpu_kernel.h | 4 +-
 .../kernels/funcs/detection/bbox_util.cu.h | 2 +-
 .../elementwise/elementwise_op_function.h | 14 +-
 paddle/phi/kernels/funcs/fc_functor.cu | 12 +-
 paddle/phi/kernels/funcs/im2col.cc | 25 +-
 paddle/phi/kernels/funcs/im2col.cu | 69 +--
 paddle/phi/kernels/funcs/im2col_cfo_cpu.h | 20 +-
 paddle/phi/kernels/funcs/index_put_utils.h | 2 +-
 paddle/phi/kernels/funcs/jit/gen/seqpool.cc | 3 +-
 paddle/phi/kernels/funcs/jit/gen_base.cc | 3 +-
 .../funcs/jit/more/intrinsic/crf_decoding.cc | 2 +-
 paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 19 +-
 paddle/phi/kernels/funcs/math/beam_search.cu | 7 +-
 .../phi/kernels/funcs/math/context_project.h | 8 +-
 paddle/phi/kernels/funcs/math/tree2col.cu | 3 +-
 paddle/phi/kernels/funcs/math/unpooling.cc | 8 +-
 paddle/phi/kernels/funcs/matrix_inverse.cu | 2 +-
 paddle/phi/kernels/funcs/matrix_solve.h | 4 +-
 paddle/phi/kernels/funcs/maxouting.cc | 2 +-
 .../kernels/funcs/multi_tensor_apply_util.h | 8 +-
 .../kernels/funcs/multihead_matmul_functor.cu | 6 +-
 paddle/phi/kernels/funcs/norm_utils.cu.h | 3 +-
 paddle/phi/kernels/funcs/pooling.cu | 4 +-
 paddle/phi/kernels/funcs/sequence_padding.cc | 8 +-
 paddle/phi/kernels/funcs/sparse/scatter.cu.h | 7 +-
 paddle/phi/kernels/funcs/stack_functor.h | 8 +-
 .../phi/kernels/funcs/sync_batch_norm_utils.h | 15 +-
 .../phi/kernels/funcs/top_k_function_cuda.h | 2 +-
 .../phi/kernels/funcs/transpose_function.cu.h | 17 +-
 paddle/phi/kernels/funcs/unsqueeze.h | 6 +-
 paddle/phi/kernels/funcs/vol2col.cc | 14 +-
 paddle/phi/kernels/funcs/vol2col.cu | 38 +-
 .../kernels/funcs/weight_dequant_functor.h | 12 +-
 paddle/phi/kernels/funcs/weight_only_gemv.cu | 10 +-
 .../cpu/fused_embedding_fc_lstm_kernel.cc | 3 +-
 .../kernels/fusion/cpu/fusion_gru_kernel.cc | 2 +-
 .../kernels/fusion/cpu/fusion_lstm_kernel.cc | 5 +-
 .../cpu/fusion_seqconv_eltadd_relu_kernel.cc | 2 +-
 .../fusion/cpu/self_dp_attention_kernel.cc | 10 +-
 .../fusion/cutlass/conv2d/conv2d_util.cu | 18 +-
 .../threadblock/epilogue_tensor_op_int32.h | 16 +-
 .../gemm/kernel/fpA_intB_gemm.h | 5 +-
 .../gemm/kernel/fpA_intB_gemm_split_k.h | 2 +-
 .../warp/mma_tensorop_compute_B_with_f16.h | 4 +-
 .../gemm/warp/mma_tensorop_dequantizer.h | 3 +-
 .../fpA_intB_gemm/fpA_intB_gemm_template.h | 9 +-
 .../epilogue/epilogue_pipelined.h | 8 +-
 .../gemm/attention_scaling_coefs_updater.h | 40 +-
 .../gemm/mma_accum_lambda_iterator.h | 38 +-
 .../gemm/mma_from_smem.h | 26 +-
 .../epilogue_predicated_tile_iterator.h | 51 +--
 ...cated_tile_access_iterator_residual_last.h | 2 +-
 .../predicated_tile_iterator_residual_last.h | 16 +-
 .../iterators/warp_iterator_from_smem.h | 2 +-
 .../kernel_backward.h | 10 +-
 paddle/phi/kernels/fusion/gpu/block_attn.h | 401 ++++++++++--------
 .../gpu/block_multi_head_attention_kernel.cu | 3 +-
 paddle/phi/kernels/fusion/gpu/fmha_ref.h | 12 +-
 .../gpu/fused_gate_attention_grad_kernel.cu | 12 +-
 .../fusion/gpu/fused_gate_attention_kernel.cu | 12 +-
 .../fused_layernorm_residual_dropout_bias.h | 10 +-
 .../gpu/fused_multi_transformer_kernel.cu | 6 +-
 .../gpu/fused_multi_transformer_op.cu.h | 212 +++++----
 .../gpu/fused_seqpool_cvm_grad_kernel.cu | 4 +-
 .../fusion/gpu/fused_seqpool_cvm_kernel.cu | 2 +-
 .../gpu/fused_softmax_mask_grad_kernel.cu | 6 +-
 .../fusion/gpu/fused_softmax_mask_kernel.cu | 19 +-
 ...softmax_mask_upper_triangle_grad_kernel.cu | 2 +-
 ...used_softmax_mask_upper_triangle_kernel.cu | 2 +-
 .../fused_weighted_swiglu_act_quant_kernel.cu | 2 +-
 .../gpu/masked_multihead_attention_kernel.cu | 70 +--
 .../fusion/gpu/multihead_matmul_kernel.cu | 22 +-
 .../fusion/gpu/qkv_unpack_mha_kernel.cu | 14 +-
 .../fusion/onednn/fusion_gru_kernel.cc | 6 +-
 .../kernels/fusion/onednn/fusion_rnn_onednn.h | 6 +-
 .../xpu/block_multi_head_attention_kernel.cc | 5 +-
 .../embedding_with_eltwise_add_xpu_kernel.cc | 4 +-
 .../kernels/gpu/affine_channel_grad_kernel.cu | 7 +-
 .../gpu/broadcast_tensors_grad_kernel.cu | 4 +-
 .../kernels/gpu/class_center_sample_kernel.cu | 2 +-
 .../kernels/gpu/correlation_grad_kernel.cu | 8 +-
 paddle/phi/kernels/gpu/correlation_kernel.cu | 6 +-
 .../kernels/gpu/cross_entropy_grad_kernel.cu | 6 +-
 .../phi/kernels/gpu/cross_entropy_kernel.cu | 4 +-
 paddle/phi/kernels/gpu/depthwise_conv.h | 227 +++++-----
 paddle/phi/kernels/gpu/determinant_kernel.cu | 2 +-
 .../gpu/distribute_fpn_proposals_kernel.cu | 2 +-
 .../phi/kernels/gpu/edit_distance_kernel.cu | 12 +-
 paddle/phi/kernels/gpu/elementwise_grad.h | 2 +-
 .../phi/kernels/gpu/flash_attn_v3_kernel.cu | 22 +-
 paddle/phi/kernels/gpu/flash_attn_v3_utils.cu | 4 +-
 .../kernels/gpu/generate_proposals_kernel.cu | 2 +-
 .../phi/kernels/gpu/global_gather_kernel.cu | 2 +-
 .../phi/kernels/gpu/global_scatter_kernel.cu | 2 +-
 .../phi/kernels/gpu/group_norm_grad_kernel.cu | 4 +-
 paddle/phi/kernels/gpu/group_norm_kernel.cu | 4 +-
 paddle/phi/kernels/gpu/instance_norm_utils.h | 3 +-
 paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/lrn_grad_kernel.cu | 10 +-
 paddle/phi/kernels/gpu/lrn_kernel.cu | 9 +-
 .../phi/kernels/gpu/multiclass_nms3_kernel.cu | 15 +-
 paddle/phi/kernels/gpu/multinomial_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/norm_kernel.cu | 3 +-
 paddle/phi/kernels/gpu/prior_box_kernel.cu | 8 +-
 .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/psroi_pool_kernel.cu | 6 +-
 .../kernels/gpu/repeat_interleave_kernel.cu | 2 +-
.../phi/kernels/gpu/rms_norm_grad_kernel.cu | 13 +- paddle/phi/kernels/gpu/roll_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel.cu | 2 +- .../phi/kernels/gpu/row_conv_grad_kernel.cu | 4 +- .../gpu/sequence_expand_grad_kernel.cu | 2 +- .../phi/kernels/gpu/sequence_expand_kernel.cu | 2 +- paddle/phi/kernels/gpu/shuffle_channel.h | 6 +- .../gpu/shuffle_channel_grad_kernel.cu | 2 +- .../phi/kernels/gpu/shuffle_channel_kernel.cu | 2 +- .../phi/kernels/gpu/slogdeterminant_kernel.cu | 2 +- .../phi/kernels/gpu/top_p_sampling_kernel.cu | 2 +- paddle/phi/kernels/gpu/tril_indices_kernel.cu | 2 +- paddle/phi/kernels/gpu/unpool_grad_kernel.cu | 4 +- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- .../gpu/weighted_sample_neighbors_kernel.cu | 10 +- .../phi/kernels/gpu/yolo_box_head_kernel.cu | 5 +- .../phi/kernels/gpu/yolo_box_post_kernel.cu | 10 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 8 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 4 +- .../gpudnn/conv_transpose_grad_kernel.cu | 8 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 4 +- .../impl/anchor_generator_kernel_impl.h | 2 +- .../impl/broadcast_tensors_kernel_impl.h | 4 +- .../kernels/impl/cholesky_grad_kernel_impl.h | 27 +- .../impl/collect_fpn_proposals_kernel_impl.h | 7 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/fold_grad_kernel_impl.h | 14 +- paddle/phi/kernels/impl/fold_kernel_impl.h | 14 +- .../kernels/impl/im2sequence_kernel_impl.h | 5 +- .../impl/llm_int8_matmul_kernel_impl.h | 18 +- .../kernels/impl/matrix_power_kernel_impl.h | 3 +- .../kernels/impl/unstack_grad_kernel_impl.h | 2 +- .../impl/weight_quantize_kernel_gpu_impl.h | 4 +- .../impl/weight_quantize_kernel_impl.h | 10 +- paddle/phi/kernels/onednn/concat_kernel.cc | 2 +- .../phi/kernels/onednn/matmul_grad_kernel.cc | 4 +- paddle/phi/kernels/onednn/multi_gru_kernel.cc | 12 +- .../phi/kernels/onednn/reduce_kernel_impl.h | 2 +- .../kernels/primitive/compute_primitives.h | 4 +- .../primitive/datamover_primitives_xpu2.h | 10 +- .../stride/reduce_grad_stride_kernel.cu | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 2 +- 180 files changed, 1217 insertions(+), 1024 deletions(-) mode change 100755 => 100644 paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc diff --git a/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc b/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc index 4b1dbee20c6aec..c023ea2a82345d 100644 --- a/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc +++ b/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc @@ -76,8 +76,9 @@ void AddPositionEncodingKernel(const Context& dev_ctx, const int half_size = enc_size / 2; for (int i = 0; i < batch_size; ++i) { - const int max_length = - x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]; + const auto max_length(x_lod.empty() ? max_seq_len + : x_lod[0][i + 1] - x_lod[0][i]); + for (int j = 0; j < max_length; ++j) { for (int k = 0; k < half_size; ++k) { const double val = diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index ecc3cc4df61b13..1d00c63ab76599 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -181,7 +181,7 @@ void BatchNormGradFunctor(const Context& dev_ctx, bias_arr.setZero(); } - int scale_coeff = use_global_stats ? 1 : N * sample_size; + auto scale_coeff = use_global_stats ? 
1 : N * sample_size; const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coeff; DenseTensor dy_sum; diff --git a/paddle/phi/kernels/cpu/box_coder_kernel.cc b/paddle/phi/kernels/cpu/box_coder_kernel.cc index 6a0998a3bfd088..cdb60de18e9642 100644 --- a/paddle/phi/kernels/cpu/box_coder_kernel.cc +++ b/paddle/phi/kernels/cpu/box_coder_kernel.cc @@ -120,7 +120,7 @@ void DecodeCenterSize(const DenseTensor *target_box, std::array var_data{1., 1., 1., 1.}; T *var_ptr = var_data.data(); size_t offset = i * col * len + j * len; - int prior_box_offset = axis == 0 ? j * len : i * len; + auto prior_box_offset = axis == 0 ? j * len : i * len; T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + @@ -135,7 +135,7 @@ void DecodeCenterSize(const DenseTensor *target_box, T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; - int prior_var_offset = axis == 0 ? j * len : i * len; + auto prior_var_offset = axis == 0 ? j * len : i * len; if (var_size == 2) { std::memcpy(var_ptr, prior_box_var->data() + prior_var_offset, diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 40964b6b447c42..31880d0160094d 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -112,8 +112,8 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx, std::vector reduce_dims_vec; std::vector reshape_dims_vec; for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; + auto out_axis = out_rank - j - 1; + auto in_axis = in_rank - j - 1; reshape_dims_vec.push_back(static_cast(input_dims[j])); if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index af17fb06c6ec90..df497513d04dec 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -77,8 +77,9 @@ inline int ConvOutSize(int input_size, int pad_left, int pad_right, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = + const auto dkernel(dilation * (filter_size - 1) + 1); + + auto output_size = (input_size + (pad_left + pad_right) - dkernel) / stride + 1; PADDLE_ENFORCE_GT( diff --git a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc index f9b3daee2571a4..b2880a1ce33b9f 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc @@ -95,8 +95,8 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx, const int remain = d / axis_dim; for (int i = 0; i < n; ++i) { // for each sample_1_dim for (int j = 0; j < remain; j++) { // for each sample_other_dims - int idx = i * remain + j; // this sample's label_idx. for 1d case, - // remain=1 and j=0, so, idx = i + auto idx = i * remain + j; // this sample's label_idx. 
for 1d case, + // remain=1 and j=0, so, idx = i auto lbl = static_cast(label_data[idx]); // NOLINT if (lbl == ignore_index) { for (int k = 0; k < axis_dim; ++k) { // for each class id's label @@ -147,8 +147,8 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx, const int remain = d / axis_dim; for (int i = 0; i < n; ++i) { // for each sample_1_dim for (int j = 0; j < remain; j++) { // for each sample_other_dims - int idx = i * remain + j; // this sample's label_idx. for 1d case, - // remain=1 and j=0, so, idx = i + auto idx = i * remain + j; // this sample's label_idx. for 1d case, + // remain=1 and j=0, so, idx = i auto lbl = static_cast(label_data[idx]); // NOLINT if (lbl == ignore_index) { for (int k = 0; k < axis_dim; ++k) { // for each class id's label diff --git a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc index c1c13e1539bdb9..8d83872f1768b5 100644 --- a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc +++ b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc @@ -33,7 +33,7 @@ void DistributeFpnProposalsKernel( std::vector multi_fpn_rois, std::vector multi_level_rois_num, DenseTensor* restore_index) { - const int num_level = max_level - min_level + 1; + const auto num_level(max_level - min_level + 1); // check that the fpn_rois is not empty if (!rois_num.get_ptr()) { diff --git a/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc b/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc index 03f1ecaf162ee1..48d48a6ae4736a 100644 --- a/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc +++ b/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc @@ -82,7 +82,7 @@ void LookupTableDequantKernel(const Context &dev_ctx, ids[i])); float min = *(table + ids[i] * quant_number); float max = *(table + ids[i] * quant_number + 1); - int offset = ids[i] * quant_number + 2; + auto offset = ids[i] * quant_number + 2; const unsigned char *tensor_buf = reinterpret_cast(table + offset); dequant( diff --git a/paddle/phi/kernels/cpu/lrn_kernel.cc b/paddle/phi/kernels/cpu/lrn_kernel.cc index d4dfcacdd6a2ca..6efe58243a447d 100644 --- a/paddle/phi/kernels/cpu/lrn_kernel.cc +++ b/paddle/phi/kernels/cpu/lrn_kernel.cc @@ -91,7 +91,7 @@ struct LRNFunctor { } for (int c = 1; c < C; ++c) { // copy previous scale - int mid_offset = i * fea_size + c * img_size; + auto mid_offset = i * fea_size + c * img_size; std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size, img_size * sizeof(T)); diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 56c2459f61e43b..0f08c0a1c7a3c4 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -42,7 +42,7 @@ void LapackSVD(const T* x_data, int mn = std::min(rows, cols); T* a = const_cast(x_data); // NOLINT int lda = rows; - int lwork = 3 * mn + std::max(mx, 7 * mn); + auto lwork = 3 * mn + std::max(mx, 7 * mn); std::vector> rwork( std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn)); std::vector work(lwork); diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index 4f9cc16890ea79..5959dda73eaaca 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -79,12 +79,12 @@ void PsroiPoolGradKernel(const Context& dev_ctx, int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; int c = (i / 
pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; + auto n = i / pooled_width / pooled_height / output_channels; // set roi_batch_id int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = + auto input_channel = (c * pooled_height + ph) * pooled_width + pw; + auto input_offset = (roi_batch_id * input_channels + input_channel) * height * width; T* offset_dx_data = dx_data + input_offset; @@ -124,7 +124,7 @@ void PsroiPoolGradKernel(const Context& dev_ctx, T diff_val = is_empty ? 0. : dout_data[i] / bin_area; for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; + auto input_index = ih * width + iw; offset_dx_data[input_index] += diff_val; } } diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index db16aa3a541cd0..56241730d39ce1 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -148,7 +148,7 @@ void PsroiPoolKernel(const Context& dev_ctx, wend = std::min(std::max(wend, 0), width); int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; + auto input_channel = (c * pooled_height + ph) * pooled_width + pw; int input_plane_offset = static_cast( roi_batch_id * in_stride[0] + input_channel * in_stride[1]); const T* offset_input_data = input_data + input_plane_offset; diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index d7c1df8a0bb615..538ab4f125f1fa 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -99,7 +99,7 @@ void ResetParameterVector(const std::vector& raw_params_vec, for (int j = 0; j < layer_weight_size; j++) { int k = j % 4; const int& section = j / 4; - int tensor_idx = i * 2 * direction_num + section * 2 + k % 2; + auto tensor_idx = i * 2 * direction_num + section * 2 + k % 2; if (k >= 2) { tensor_idx += bias_start_idx; } @@ -217,8 +217,8 @@ void AllocateReserveData(const Context& dev_ctx, int direction_num = is_bidirec ? 2 : 1; int time_step = input->dims()[0]; int batch_size = input->dims()[1]; - int block_size = direction_num * time_step * batch_size * hidden_size; - int hidden_data_idx = (num_layers - 1); + auto block_size = direction_num * time_step * batch_size * hidden_size; + auto hidden_data_idx = (num_layers - 1); if (is_lstm(mode)) { hidden_data_idx += (gate_num + 2) * num_layers; } else if (is_gru(mode)) { diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index de355c643b1d9a..5d25d77ebd68aa 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -384,7 +384,7 @@ struct GradLayer { const std::string& mode) { int direction_num = is_bidirec ? 2 : 1; int current_reverse_idx = is_reverse ? 
1 : 0; - int current_layer_idx = direction_num * layer_idx + current_reverse_idx; + auto current_layer_idx = direction_num * layer_idx + current_reverse_idx; int begin_idx = 0; if (is_reverse) { begin_idx = time_step; diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc index 00bf2968b0fd5c..4b60feb03ad2f2 100644 --- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -175,7 +175,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, out_grad_data + n * out_stride[0] + c * out_stride[1]; for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; + auto pool_index = ph * pooled_width + pw; T out_grad_this_bin = batch_out_grad_data[pool_index]; int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc index 465412b40074a9..5c5405d5539308 100644 --- a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -86,7 +86,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, for (int c = 0; c < channels; ++c) { for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; + auto pool_index = ph * pooled_width + pw; if (arg_max_data[pool_index] >= 0) { auto index = arg_max_data[pool_index]; batch_grad_data[index] += out_grad_data[pool_index]; diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc index bdef9c8ec6e840..299a4566fcdd63 100644 --- a/paddle/phi/kernels/cpu/roi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -135,7 +135,7 @@ void RoiPoolKernel(const Context& dev_ctx, wstart = std::min(std::max(wstart + box_start_w, 0), width); wend = std::min(std::max(wend + box_start_w, 0), width); - const int pool_index = ph * pooled_width + pw; + const auto pool_index(ph * pooled_width + pw); // Define an empty pooling region to be zero bool is_empty = (hend <= hstart) || (wend <= wstart); @@ -145,7 +145,8 @@ void RoiPoolKernel(const Context& dev_ctx, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; + const auto index(h * width + w); + if (batch_data[index] > output_data[pool_index]) { output_data[pool_index] = batch_data[index]; arg_max_data[pool_index] = index; diff --git a/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc b/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc index c1d3356935accd..16fc4232870be9 100644 --- a/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc @@ -46,7 +46,7 @@ struct SequenceExpandGradFunctor { if (x_seq_len == 0) continue; auto dx_sub = dx->Slice(x_start, x_end); dx_sub.Resize(common::flatten_to_1d(dx_sub.dims())); - int dout_end = dout_offset + repeat_num * x_seq_len; + auto dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); phi::funcs::ColwiseSum col_sum; diff --git a/paddle/phi/kernels/cpu/svd_kernel.cc b/paddle/phi/kernels/cpu/svd_kernel.cc index a88e8c98854d9a..0fe409883b3c5b 100644 --- a/paddle/phi/kernels/cpu/svd_kernel.cc +++ b/paddle/phi/kernels/cpu/svd_kernel.cc @@ -82,8 +82,8 @@ void BatchSvd(const T* X, // NOTE: this function is row major, because this function called 
the lapack. int stride = rows * cols; int k = std::min(rows, cols); - int stride_u = full ? rows * rows : k * rows; - int stride_v = full ? cols * cols : k * cols; + auto stride_u = full ? rows * rows : k * rows; + auto stride_v = full ? cols * cols : k * cols; for (int i = 0; i < batches; ++i) { LapackSvd(X + i * stride, U + i * stride_u, diff --git a/paddle/phi/kernels/cpu/unpool_grad_kernel.cc b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc index afb2dfdcb095c9..960b2da133df9f 100644 --- a/paddle/phi/kernels/cpu/unpool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc @@ -113,8 +113,8 @@ void Unpool3dGrad(const Context& dev_ctx, const int output_depth = static_cast(out.dims()[2]); const int output_height = static_cast(out.dims()[3]); const int output_width = static_cast(out.dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const IndT* indices_data = indices.data(); for (int b = 0; b < batch_size; ++b) { diff --git a/paddle/phi/kernels/cpu/unpool_kernel.cc b/paddle/phi/kernels/cpu/unpool_kernel.cc index 965698dd8cdd08..e9048e43389985 100644 --- a/paddle/phi/kernels/cpu/unpool_kernel.cc +++ b/paddle/phi/kernels/cpu/unpool_kernel.cc @@ -107,8 +107,8 @@ void Unpool3d(const Context& dev_ctx, const int output_depth = static_cast(out->dims()[2]); const int output_height = static_cast(out->dims()[3]); const int output_width = static_cast(out->dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const T* input_data = x.data(); const IndT* indices_data = indices.data(); for (int b = 0; b < batch_size; ++b) { diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc index fad1b2ec2b2663..c215b6af5d596e 100644 --- a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc +++ b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc @@ -168,7 +168,7 @@ void ViterbiDecodeKernel(const Context& dev_ctx, std::vector historys; // We create tensor buffer in order to avoid allocating memory frequently // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... - int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; + auto buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; DenseTensor int_buffer = Empty(dev_ctx, {buffer_size}); funcs::TensorBuffer int_tensor_buffer(int_buffer); // create float tensor buffer diff --git a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc index 96c38a7f1560d0..ee00050b57575d 100644 --- a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc @@ -282,7 +282,7 @@ void YoloLossKernel(const Context& dev_ctx, // If best IoU is bigger then ignore_thresh, // ignore the objectness loss. 
if (best_iou > ignore_thresh) { - int obj_idx = (i * mask_num + j) * stride + k * w + l; + auto obj_idx = (i * mask_num + j) * stride + k * w + l; obj_mask_data[obj_idx] = static_cast(-1); } // all losses should be calculated if best IoU @@ -339,7 +339,7 @@ void YoloLossKernel(const Context& dev_ctx, stride, score); - int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; + auto obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; obj_mask_data[obj_idx] = score; int label = gt_label_data[i * b + t]; diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 05733300c9a23c..64958823f053a9 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -98,7 +98,7 @@ static int GetVectorizedSize(const DenseTensor* tensor) { return 1; } constexpr int max_load_bits = 128; - int valid_vec_size = max_load_bits / CHAR_BIT / element_size; + auto valid_vec_size = max_load_bits / CHAR_BIT / element_size; uint64_t address = reinterpret_cast(tensor->data()); // Currently, decide to deal with no more than 4 data once while adopting diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 2c5b59ba4b8f6a..4a6376e372ce11 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1620,13 +1620,13 @@ void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, int sub_width = W2 / head_number; for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; + auto sub_matA_offset = (transA == CblasNoTrans) + ? i * (W1 / head_number) + : i * (W1 / head_number) * H1; + auto sub_matB_offset = (transB == CblasNoTrans) + ? i * (W2 / head_number) + : i * (W2 / head_number) * H2; + auto sub_matC_offset = i * W2 / head_number; for (int k = 0; k < batchCount; ++k) { a_array[k] = &A[k * strideA] + sub_matA_offset; b_array[k] = &B[k * strideB] + sub_matB_offset; @@ -1665,12 +1665,12 @@ void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, int sub_width = W1 / head_number; for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W1 / head_number) * W2 - : i * (W1 / head_number); + auto sub_matA_offset = (transA == CblasNoTrans) + ? i * (W1 / head_number) + : i * (W1 / head_number) * H1; + auto sub_matB_offset = (transB == CblasNoTrans) + ? 
i * (W1 / head_number) * W2 + : i * (W1 / head_number); int sub_matC_offset = i * W2; for (int k = 0; k < batchCount; ++k) { a_array[k] = &A[k * strideA] + sub_matA_offset; diff --git a/paddle/phi/kernels/funcs/block_radix_topk.cuh b/paddle/phi/kernels/funcs/block_radix_topk.cuh index 6958bbe834721f..06870d4bcfce77 100644 --- a/paddle/phi/kernels/funcs/block_radix_topk.cuh +++ b/paddle/phi/kernels/funcs/block_radix_topk.cuh @@ -65,7 +65,7 @@ class BlockRadixTopKGlobalMemory { assert(k < size && k > 0); int target_k = k; UnsignedBits key_pattern = 0; - int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; + auto digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; for (; digit_pos >= 0; digit_pos -= RADIX_BITS) { UpdateSharedBins(data, size, digit_pos, key_pattern); InclusiveScanBins(); @@ -239,7 +239,7 @@ class BlockRadixTopKRegister { #pragma unroll for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - int idx = KEY * BLOCK_SIZE + tid_; + auto idx = KEY * BLOCK_SIZE + tid_; unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY]; if (idx < valid_count) search_mask_ |= (1U << KEY); @@ -248,7 +248,7 @@ class BlockRadixTopKRegister { int target_k = k; int prefix_k = 0; - for (int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; + for (auto digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) { UpdateSharedBins(unsigned_keys, digit_pos, prefix_k); InclusiveScanBins(); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 167be9f2e0d74e..85505581514c1b 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -445,7 +445,7 @@ void LaunchBroadcastKernel( const int blocks = 8; int read_lens = configs[0].buf_len; auto stream = dev_ctx.x_context()->xpu_stream; - int main_offset = (numel / (read_lens * threads)) * read_lens * threads; + auto main_offset = (numel / (read_lens * threads)) * read_lens * threads; int tail_tid = numel % (read_lens * threads); VectorizedBroadcastKernel @@ -465,7 +465,7 @@ void LaunchBroadcastKernel( auto stream = dev_ctx.stream(); auto threads = gpu_config.GetBlockSize(); auto blocks = gpu_config.block_per_grid; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + auto main_offset = (numel / (VecSize * threads)) * VecSize * threads; int tail_tid = numel % (VecSize * threads); if (classifier.all_elementwise) { diff --git a/paddle/phi/kernels/funcs/correlation_funcs.cu.h b/paddle/phi/kernels/funcs/correlation_funcs.cu.h index db121f7119e702..446688003cbda1 100644 --- a/paddle/phi/kernels/funcs/correlation_funcs.cu.h +++ b/paddle/phi/kernels/funcs/correlation_funcs.cu.h @@ -84,8 +84,8 @@ __global__ void channel_first(const T *input, int64_t global_idx = static_cast(blockIdx.x); int64_t stride = static_cast(gridDim.x); - int p_H = H + 2 * pad_size; - int p_W = W + 2 * pad_size; + auto p_H = H + 2 * pad_size; + auto p_W = W + 2 * pad_size; int64_t p_dimcw = channel * p_W; int64_t p_dimchw = channel * p_H * p_W; diff --git a/paddle/phi/kernels/funcs/correlation_funcs.h b/paddle/phi/kernels/funcs/correlation_funcs.h index 6f2ddc6ab2da3c..745d256233c050 100644 --- a/paddle/phi/kernels/funcs/correlation_funcs.h +++ b/paddle/phi/kernels/funcs/correlation_funcs.h @@ -30,8 +30,8 @@ inline std::vector CorrelationOutputSize(int batch, std::vector output_shape({batch}); int kernel_radius = (kernel_size - 1) / 2; int border_radius = kernel_radius + 
max_displacement; - int padded_input_height = input_height + 2 * pad_size; - int padded_input_width = input_width + 2 * pad_size; + auto padded_input_height = input_height + 2 * pad_size; + auto padded_input_width = input_width + 2 * pad_size; int output_channel = ((max_displacement / stride2) * 2 + 1) * ((max_displacement / stride2) * 2 + 1); output_shape.push_back(output_channel); diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 879c3b3a1ddc9d..620729ad06356e 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -86,8 +86,9 @@ inline void ModulatedDeformableIm2colCPUKernel( } *data_col_ptr = val; if (data_mask_ptr) { - const int data_mask_hw_ptr = - ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const auto data_mask_hw_ptr( + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col); + const T mask = data_mask_ptr[data_mask_hw_ptr]; *data_col_ptr *= mask; } diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index b491cbe120d06f..95c671686d4745 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -129,7 +129,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, T b0[Tiled_size]; int COL = blockIdx.x * blockDim.x + threadIdx.x; - int Tiled_mask = ((1 << Tiled_size) - 1); + auto Tiled_mask = ((1 << Tiled_size) - 1); // Tiled matrix multiply using register shift, faster than sm. if (prev_output_value) { for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) { @@ -191,7 +191,7 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, T b0[Tiled_size]; T c0 = 0.0f; - int Tiled_mask = ((1 << Tiled_size) - 1); + auto Tiled_mask = ((1 << Tiled_size) - 1); //- Tiled matrix multiply with register shift if (prev_out_value) { for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) { diff --git a/paddle/phi/kernels/funcs/detection/bbox_util.cu.h b/paddle/phi/kernels/funcs/detection/bbox_util.cu.h index f60b6d2e584794..b7ae6124a844f4 100644 --- a/paddle/phi/kernels/funcs/detection/bbox_util.cu.h +++ b/paddle/phi/kernels/funcs/detection/bbox_util.cu.h @@ -214,7 +214,7 @@ static __global__ void FilterBBoxes(const T *bboxes, } __syncthreads(); if (threadIdx.x == 0) { - int size = (num - i) < BlockSize ? num - i : BlockSize; + auto size = (num - i) < BlockSize ? num - i : BlockSize; for (int j = 0; j < size; ++j) { if (keep_index[j] > -1) { keep[cnt++] = keep_index[j]; diff --git a/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h b/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h index 7859f39aaa48e3..16741164b30783 100644 --- a/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h +++ b/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h @@ -123,7 +123,7 @@ static void FusedElemwiseAndActBroadcast1CPU(const T *x, T *intermediate_out) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int offset = i * w + j; + auto offset = i * w + j; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? x[offset] : x[j]; @@ -171,7 +171,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < post; ++k) { - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? 
x[offset] : x[j]; @@ -219,7 +219,7 @@ static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel( int j = threadIdx.x; while (j < w) { - int offset = i * w + j; + auto offset = i * w + j; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? x[offset] : x[j]; @@ -295,7 +295,7 @@ static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel( int k = tid % post; if (i >= pre) break; - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? x[offset] : x[j]; @@ -596,7 +596,7 @@ static void FusedElemwiseAndActGradBroadcast1CPU( T zero = static_cast(0); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int offset = i * w + j; + auto offset = i * w + j; tmp_out_idx = BcastY ? j : offset; y_idx = BcastY ? j : offset; @@ -694,7 +694,7 @@ static void FusedElemwiseAndActGradBroadcast2CPU( for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < post; ++k) { - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; tmp_out_idx = BcastY ? j : offset; y_idx = BcastY ? j : offset; @@ -988,7 +988,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( int k = ttid % post; if (i >= pre) break; - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; tmp_out_idx = BcastY ? j : offset; y_idx = BcastY ? j : offset; diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee328a75..b83de76474ce50 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -107,7 +107,8 @@ void AddReluKernel( gpuStream_t stream, const int M, const int N, T* Y, const T* B, bool relu) { if (N % 4 == 0) { const int threads = 256; - const int num = M * N / 4; + const auto num(M * N / 4); + const int blocks = (num + threads - 1) / threads; typedef typename FcTypeTraits::Type trans_type; auto* bias_ptr_v4 = reinterpret_cast(B); @@ -223,8 +224,10 @@ void LaunchBiasAddReluHalf2Kernel(cudaStream_t stream, const float16* B, bool relu) { const int threads = 256; - const int vec_num = rows * cols / (Half2VecSize * 2); - const int half2_num = rows * cols / 2; + const auto vec_num(rows * cols / (Half2VecSize * 2)); + + const auto half2_num(rows * cols / 2); + const int blocks = (vec_num + threads - 1) / threads; // Here reinterpret_cast to half2 type. 
typedef typename FcTypeTraits::Type trans_type; @@ -308,7 +311,8 @@ void AddReluKernel(gpuStream_t stream, bool relu) { if (N % 4 == 0) { const int threads = 256; - const int num = M * N / 4; + const auto num(M * N / 4); + const int blocks = (num + threads - 1) / threads; typedef typename FcTypeTraits::Type trans_type; auto* bias_ptr_v4 = reinterpret_cast(B); diff --git a/paddle/phi/kernels/funcs/im2col.cc b/paddle/phi/kernels/funcs/im2col.cc index a6478f01c19422..4ccad15f0975e1 100644 --- a/paddle/phi/kernels/funcs/im2col.cc +++ b/paddle/phi/kernels/funcs/im2col.cc @@ -121,7 +121,7 @@ class Col2ImFunctor { common::errors::InvalidArgument("Output_height and padding(padding_up, " "padding_down) are inconsistent.")); - int channels_col = im_channels * filter_height * filter_width; + auto channels_col = im_channels * filter_height * filter_width; T* im_data = im->data(); const T* col_data = col.data(); @@ -131,9 +131,9 @@ class Col2ImFunctor { int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + auto im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < col_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + auto im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; if ((im_row_idx) >= 0 && (im_row_idx) < im_height && (im_col_idx) >= 0 && (im_col_idx) < im_width) { int im_offset = 0; @@ -215,14 +215,14 @@ class Im2ColFunctor { for (int channel = 0; channel < im_channels; ++channel) { for (int filter_row_idx = 0; filter_row_idx < filter_height; ++filter_row_idx) { - int im_row_offset = + auto im_row_offset = col_row_idx * stride[0] + filter_row_idx - padding[0]; for (int filter_col_idx = 0; filter_col_idx < filter_width; ++filter_col_idx) { - int im_col_offset = + auto im_col_offset = col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = + auto col_offset = ((((col_row_idx)*col_width + col_col_idx) * im_channels + channel) * filter_height + @@ -230,8 +230,9 @@ class Im2ColFunctor { filter_width + filter_col_idx; - int im_offset = (channel * im_height + im_row_offset) * im_width + - im_col_offset; + auto im_offset = + (channel * im_height + im_row_offset) * im_width + + im_col_offset; col_data[col_offset] = (im_row_offset < 0 || im_row_offset >= im_height || im_col_offset < 0 || im_col_offset >= im_width) @@ -300,14 +301,14 @@ class Col2ImFunctor { for (int channel = 0; channel < im_channels; ++channel) { for (int filter_row_idx = 0; filter_row_idx < filter_height; ++filter_row_idx) { - int im_row_offset = + auto im_row_offset = col_row_idx * stride[0] + filter_row_idx - padding[0]; for (int filter_col_idx = 0; filter_col_idx < filter_width; ++filter_col_idx) { - int im_col_offset = + auto im_col_offset = col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = + auto col_offset = (((col_row_idx * col_width + col_col_idx) * im_channels + channel) * filter_height + @@ -317,7 +318,7 @@ class Col2ImFunctor { if (im_row_offset >= 0 && im_row_offset < im_height && im_col_offset >= 0 && im_col_offset < im_width) { - int im_offset = + auto im_offset = (channel * im_height + im_row_offset) * im_width + im_col_offset; im_data[im_offset] += col_data[col_offset]; diff --git a/paddle/phi/kernels/funcs/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu index cea94f97453d04..74f8037a2f393a 100644 --- a/paddle/phi/kernels/funcs/im2col.cu +++ 
b/paddle/phi/kernels/funcs/im2col.cu @@ -41,8 +41,8 @@ __global__ void im2col(const T* data_im, int col_width, T* data_col, const DataLayout data_layout) { - int input_channels = num_outs / col_height / col_width; - int channels_col = input_channels * filter_height * filter_width; + auto input_channels = num_outs / col_height / col_width; + auto channels_col = input_channels * filter_height * filter_width; const int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < num_outs) { @@ -55,15 +55,15 @@ __global__ void im2col(const T* data_im, int channel_in = (data_layout != DataLayout::kNHWC ? index / col_width / col_height : index % input_channels); - int channel_out = channel_in * filter_height * filter_width; - int h_in = h_out * stride_height - padding_height; - int w_in = w_out * stride_width - padding_width; + auto channel_out = channel_in * filter_height * filter_width; + auto h_in = h_out * stride_height - padding_height; + auto w_in = w_out * stride_width - padding_width; data_col += (channel_out * col_height + h_out) * col_width + w_out; for (int i = 0; i < filter_height; ++i) { for (int j = 0; j < filter_width; ++j) { - int rIdx = h_in + i * dilation_h; - int cIdx = w_in + j * dilation_w; + auto rIdx = h_in + i * dilation_h; + auto cIdx = w_in + j * dilation_w; int im_idx; if (data_layout != DataLayout::kNHWC) { im_idx = (channel_in * im_height + rIdx) * im_width + cIdx; @@ -126,7 +126,7 @@ class Im2ColFunctor { int col_height = col->dims()[3]; int col_width = col->dims()[4]; - int num_outputs = im_channels * col_height * col_width; + auto num_outputs = im_channels * col_height * col_width; int num_thread = 1024; #ifdef WITH_NV_JETSON phi::backends::gpu::ChangeThreadNum(dev_ctx, &num_thread); @@ -175,10 +175,11 @@ __global__ void col2im(int n, const int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - const int d_filter_height = dilation_h * (filter_height - 1) + 1; - const int d_filter_width = dilation_w * (filter_width - 1) + 1; + const auto d_filter_height(dilation_h * (filter_height - 1) + 1); - int input_channels = n / im_height / im_width; + const auto d_filter_width(dilation_w * (filter_width - 1) + 1); + + auto input_channels = n / im_height / im_width; if (index < n) { T val = static_cast(0); @@ -193,21 +194,21 @@ __global__ void col2im(int n, : index % input_channels); // compute the start and end of the output - int w_col_start = + auto w_col_start = (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; int w_col_end = min(w / stride_width + 1, col_width); - int h_col_start = + auto h_col_start = (h < d_filter_height) ? 
0 : (h - d_filter_height) / stride_height + 1; int h_col_end = min(h / stride_height + 1, col_height); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - int h_off = (h - h_col * stride_height); - int w_off = (w - w_col * stride_width); + auto h_off = (h - h_col * stride_height); + auto w_off = (w - w_col * stride_width); if (h_off % dilation_h == 0 && w_off % dilation_w == 0) { h_off /= dilation_h; w_off /= dilation_w; - int data_col_index = + auto data_col_index = (((c * filter_height + h_off) * filter_width + w_off) * col_height + h_col) * @@ -358,15 +359,15 @@ __global__ void im2colOCF(const T* im_data, channelid += blockDim.z) { for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { - int width_offset = idx + swid * stride_width - padding_width; - int height_offset = idy + shid * stride_height - padding_height; - int im_offset = width_offset + height_offset * im_width + - channelid * im_height * im_width; + auto width_offset = idx + swid * stride_width - padding_width; + auto height_offset = idy + shid * stride_height - padding_height; + auto im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; - int col_offset = idx + idy * filter_width + - channelid * filter_height * filter_width + - (shid * col_width + swid) * - (im_channels * filter_height * filter_width); + auto col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); col_data[col_offset] = (height_offset >= im_height || height_offset < 0 || @@ -430,7 +431,7 @@ class Im2ColFunctor { block_dim_y = 32; } - int block_dim_z = 1024 / block_dim_x / block_dim_y; + auto block_dim_z = 1024 / block_dim_x / block_dim_y; dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); dim3 grid(col_width, col_height); im2colOCF<<>>(im.data(), @@ -469,15 +470,15 @@ __global__ void col2imOCF(const T* col_data, channelid += blockDim.z) { for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { - int width_offset = idx + swid * stride_width - padding_width; - int height_offset = idy + shid * stride_height - padding_height; - int im_offset = width_offset + height_offset * im_width + - channelid * im_height * im_width; + auto width_offset = idx + swid * stride_width - padding_width; + auto height_offset = idy + shid * stride_height - padding_height; + auto im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; - int col_offset = idx + idy * filter_width + - channelid * filter_height * filter_width + - (shid * col_width + swid) * - (im_channels * filter_height * filter_width); + auto col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); if (height_offset >= 0 && height_offset < im_height && width_offset >= 0 && width_offset < im_width) { @@ -557,7 +558,7 @@ class Col2ImFunctor { block_dim_y = 32; } - int block_dim_z = 1024 / block_dim_x / block_dim_y; + auto block_dim_z = 1024 / block_dim_x / block_dim_y; dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); dim3 grid(col_width, col_height); col2imOCF<<>>(col.data(), diff --git a/paddle/phi/kernels/funcs/im2col_cfo_cpu.h 
b/paddle/phi/kernels/funcs/im2col_cfo_cpu.h index 1e639f1787cfec..545c01b1947041 100644 --- a/paddle/phi/kernels/funcs/im2col_cfo_cpu.h +++ b/paddle/phi/kernels/funcs/im2col_cfo_cpu.h @@ -42,7 +42,7 @@ inline void im2col_common(const phi::DenseTensor& im, int filter_width = col->dims()[2]; int output_height = col->dims()[3]; int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; + auto channels_col = im_channels * filter_height * filter_width; // Convert dimensions to 64-bit to prevent overflow in arithmetic operations const int64_t im_channels64 = im_channels; @@ -58,9 +58,9 @@ inline void im2col_common(const phi::DenseTensor& im, int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + auto im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + auto im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; // Calculate col_idx using 64-bit arithmetic to prevent overflow int64_t col_idx64 = @@ -223,7 +223,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } if (data_layout != DataLayout::kNHWC) { // Safe memcpy for filter_width == 1 case - int want = output_width - plw - prw; + auto want = output_width - plw - prw; int avail = im_width; int n = std::max(0, std::min(want, avail)); if (n > 0) { @@ -236,7 +236,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width - plw - prw; ++kow) { - int im_row = oh - plh + kh; + auto im_row = oh - plh + kh; int im_col = kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { @@ -311,7 +311,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width - (plw - kw); ++kow) { - int im_row = oh - plh + kh; + auto im_row = oh - plh + kh; int im_col = kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { @@ -339,8 +339,8 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width; ++kow) { - int im_row = oh - plh + kh; - int im_col = kw - plw + kow; + auto im_row = oh - plh + kh; + auto im_col = kw - plw + kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { dst_data[kow] = @@ -368,8 +368,8 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width - i; ++kow) { - int im_row = oh - plh + kh; - int im_col = kw - plw + kow; + auto im_row = oh - plh + kh; + auto im_col = kw - plw + kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { dst_data[kow] = diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index 0e14e613468109..d952bf17549a62 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -161,7 +161,7 @@ static phi::DDim BroadCastTensorsDims( int target_dim_size = 1; for (const auto& tensor : tensors) { auto input_ddim = tensor->dims(); - int axis = static_cast(input_ddim.size()) - index - 1; + auto axis = static_cast(input_ddim.size()) - index - 1; int dim_size = 1; if (axis >= 0) { dim_size = input_ddim[axis]; diff --git a/paddle/phi/kernels/funcs/jit/gen/seqpool.cc 
b/paddle/phi/kernels/funcs/jit/gen/seqpool.cc index 484bff22be4ea5..4ed94f6c1b5fcf 100644 --- a/paddle/phi/kernels/funcs/jit/gen/seqpool.cc +++ b/paddle/phi/kernels/funcs/jit/gen/seqpool.cc @@ -40,7 +40,8 @@ void SeqPoolJitCode::genCode() { vdivps(xmm_t(1), xmm_t(1), xmm_t(0)); vmovss(ptr[reg_tmp], xmm_t(1)); } - const int group_len = max_num_regs * block * sizeof(float); + const auto group_len(max_num_regs * block * sizeof(float)); + for (int g = 0; g < num_groups; ++g) { pool_height(g * group_len, block, max_num_regs); } diff --git a/paddle/phi/kernels/funcs/jit/gen_base.cc b/paddle/phi/kernels/funcs/jit/gen_base.cc index 71701b96f3b640..81ffa663c201cf 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.cc +++ b/paddle/phi/kernels/funcs/jit/gen_base.cc @@ -81,7 +81,8 @@ std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { } // one for x, one for y, others for z const int max_used_regs_for_n = max_num_regs - 2; - const int aligned_n = n % block == 0 ? n : (n / block + 1) * block; + const auto aligned_n(n % block == 0 ? n : (n / block + 1) * block); + const int num_block = aligned_n / block; const int num_groups = num_block / max_used_regs_for_n; std::vector groups(num_groups, max_used_regs_for_n); diff --git a/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc b/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc index 43a011277cb5ff..3c742b378f4f3d 100644 --- a/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc @@ -83,7 +83,7 @@ void CRFDecoding(const int seq_len, __m256i max_j = _mm256_set1_epi32(0); #endif /* Calculate the offset of transition_weights.*/ - int trans_offset = state_trans_base_idx * tag_num + j_offset; + auto trans_offset = state_trans_base_idx * tag_num + j_offset; for (int i = 0; i < tag_num; ++i) { /* Initialize the content of alpha variable with related offset.*/ #ifdef __AVX512F__ diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 4eae698648996b..1a7e48e6e0301d 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -223,8 +223,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( const int warp_n = warp % WARPS_N; // 0 const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 - const int c = warp_n * THREADS_PER_WARP + lane; // lane - const int r = bidx * ROWS_PER_CTA + warp_m; // row id + const auto c(warp_n * THREADS_PER_WARP + lane); + // lane + const auto r(bidx * ROWS_PER_CTA + warp_m); + // row id Vec_scale gamma[LDGS]; Vec_scale beta[LDGS]; @@ -874,9 +876,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_fast_final_kernel( const int warp = tidx / THREADS_PER_WARP; const int warp_m = warp / WARPS_N; const int warp_n = warp % WARPS_N; - const int tid_c = warp_n * THREADS_PER_WARP + lane; + const auto tid_c(warp_n * THREADS_PER_WARP + lane); + + const auto c(bidx * THREADS_PER_ROW + tid_c); - const int c = bidx * THREADS_PER_ROW + tid_c; const int r = warp_m; __shared__ U smem_space[(WARPS_M - 1) * THREADS_PER_ROW * VecSize]; @@ -1017,7 +1020,7 @@ void ln_bwd_fast_kernel_driver(const phi::GPUContext &dev_ctx, const int ROWS_PER_CTA = WARPS_M; // 4 * 1024 * 4 - const int SMEM_BYTES = ROWS_PER_CTA * cols * sizeof(U); + const auto SMEM_BYTES(ROWS_PER_CTA * cols * sizeof(U)); // #blocks = 2 * #SM const int gridx = 2 * dev_ctx.GetSMCount(); @@ -1591,7 +1594,8 @@ __global__ void 
LayerNormBackwardComputeGradInputWithSmallFeatureSize( VecT temp_grad; #pragma unroll for (int k = 0; k < DataPerTid; ++k) { - const int idx = i * DataPerTid + k; + const auto idx(i * DataPerTid + k); + const U c_h = input_data[idx]; const U c_loss = dout_data[idx]; U f_grad_input = fH * c_loss * gamma_data[idx] - sum_loss1; @@ -1606,7 +1610,8 @@ __global__ void LayerNormBackwardComputeGradInputWithSmallFeatureSize( VecT temp_grad; #pragma unroll for (int k = 0; k < DataPerTid; ++k) { - const int idx = i * DataPerTid + k; + const auto idx(i * DataPerTid + k); + const U c_h = input_data[idx]; const U c_loss = dout_data[idx]; U f_grad_input = fH * c_loss - sum_loss1; diff --git a/paddle/phi/kernels/funcs/math/beam_search.cu b/paddle/phi/kernels/funcs/math/beam_search.cu index 66c0b1951585b1..d51e3424b4419a 100644 --- a/paddle/phi/kernels/funcs/math/beam_search.cu +++ b/paddle/phi/kernels/funcs/math/beam_search.cu @@ -98,7 +98,7 @@ __device__ __forceinline__ int SelectTopBeam(Triple* top_beam, Insert(top_beam_local, tmp, beam_size); } } else { - int index = offset * seq_width + tid_of_seq; + auto index = offset * seq_width + tid_of_seq; if (!IsAccumulated) { float pre_score = pre_scores[offset]; for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { @@ -263,7 +263,8 @@ __device__ void BeamSearchDetails(int64_t* selected_ids, int selected_seq_length = finish_flag ? 0 : num_items; if (MaxSeqs > 1) { - const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + const auto seq_id((MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid); + __shared__ int shared_mem[MaxSeqs]; // [0, MaxSeqs - 1], length of each sequences @@ -322,7 +323,7 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, bool is_accumulated, int num_used_threads) { const int tid = threadIdx.x; - const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + const auto seq_id((MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid); int seq_offset_start = static_cast(seq_offsets[seq_id]); int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3206a..90545c3ccbe3df 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -162,7 +162,7 @@ class ContextProjectFunctor { up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); for (int k = 0; k < padding_rows; ++k) { - int padding_size = + auto padding_size = k + context_length < up_pad ? context_length : up_pad - k; phi::DenseTensor out_t_sub = out_t.Slice( k * context_length, k * context_length + padding_size); @@ -176,7 +176,7 @@ class ContextProjectFunctor { (sequence_height - context_start - context_length) + 1) + 1; int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = + auto padding_size = sequence_height - context_start >= context_length ? 1 : context_length - (sequence_height - context_start); @@ -292,7 +292,7 @@ class ContextProjectGradFunctor { up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); for (int k = 0; k < padding_rows; ++k) { - int padding_size = + auto padding_size = k + context_length < up_pad ? 
context_length : up_pad - k; phi::DenseTensor out_t_sub = out_t.Slice( k * context_length, k * context_length + padding_size); @@ -309,7 +309,7 @@ class ContextProjectGradFunctor { 0, (sequence_height - context_start - context_length) + 1) + 1; int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = + auto padding_size = sequence_height - context_start >= context_length ? 1 : context_length - (sequence_height - context_start); diff --git a/paddle/phi/kernels/funcs/math/tree2col.cu b/paddle/phi/kernels/funcs/math/tree2col.cu index a388072679e500..849d6ec4b011b2 100644 --- a/paddle/phi/kernels/funcs/math/tree2col.cu +++ b/paddle/phi/kernels/funcs/math/tree2col.cu @@ -34,7 +34,8 @@ __global__ void tree2col(const T* eta, const int patch_id = thread_id / feature_size; const int j = thread_id % feature_size; if (patch_id < n) { - const int begin_o = patch_id * 3 * feature_size; + const auto begin_o(patch_id * 3 * feature_size); + const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1]; T res_l = 0, res_r = 0, res_t = 0; for (int i = begin; i < end; i++) { diff --git a/paddle/phi/kernels/funcs/math/unpooling.cc b/paddle/phi/kernels/funcs/math/unpooling.cc index fffbf8ef7130bc..426622e3bba8a8 100644 --- a/paddle/phi/kernels/funcs/math/unpooling.cc +++ b/paddle/phi/kernels/funcs/math/unpooling.cc @@ -120,8 +120,8 @@ class Unpool3dMaxFunctor { const int output_depth = static_cast(output->dims()[2]); const int output_height = static_cast(output->dims()[3]); const int output_width = static_cast(output->dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = context.template Alloc(output); @@ -168,8 +168,8 @@ class Unpool3dMaxGradFunctor { const int output_depth = static_cast(output.dims()[2]); const int output_height = static_cast(output.dims()[3]); const int output_width = static_cast(output.dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = context.template Alloc(input_grad); diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e10122497096fb..2b4fb15b3f48a9 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -67,7 +67,7 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // Copy the addresses of A and A_inv from host to device, // and allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + auto num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int); phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/funcs/matrix_solve.h b/paddle/phi/kernels/funcs/matrix_solve.h index 27abdf8c2c96a7..5ab94be0ce5142 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.h +++ b/paddle/phi/kernels/funcs/matrix_solve.h @@ -99,14 +99,14 @@ void compute_solve_eigen(const Context& dev_ctx, const auto& a_mat_dims = a.dims(); const int a_rank = a_mat_dims.size(); int n = a_mat_dims[a_rank - 1]; - int a_batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; + auto a_batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; // prepare for b const auto& b_mat_dims = b.dims(); const int b_rank = b_mat_dims.size(); int b_h = n; int b_w = b_mat_dims[b_rank - 1]; - int b_batch_size = b_rank > 2 ? b.numel() / (b_h * b_w) : 1; + auto b_batch_size = b_rank > 2 ? b.numel() / (b_h * b_w) : 1; const T* a_ptr = a.data(); const T* b_ptr = b.data(); diff --git a/paddle/phi/kernels/funcs/maxouting.cc b/paddle/phi/kernels/funcs/maxouting.cc index fca6d8e39553a4..ef9e09dec9dfbf 100644 --- a/paddle/phi/kernels/funcs/maxouting.cc +++ b/paddle/phi/kernels/funcs/maxouting.cc @@ -84,7 +84,7 @@ void MaxOutGradFunctor::operator()( const T* output_grad_data = output_grad.data(); T* input_grad_data = dev_ctx.template Alloc(input_grad); for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; + auto blen = fea_size * output_channels * i; for (int c = 0; c < output_channels; ++c) { int clen = fea_size * c; for (int f = 0; f < fea_size; ++f) { diff --git a/paddle/phi/kernels/funcs/multi_tensor_apply_util.h b/paddle/phi/kernels/funcs/multi_tensor_apply_util.h index e146005c49a697..8523559779f673 100644 --- a/paddle/phi/kernels/funcs/multi_tensor_apply_util.h +++ b/paddle/phi/kernels/funcs/multi_tensor_apply_util.h @@ -89,11 +89,13 @@ static __global__ void MultiTensorApplyCUDAKernel( Args... args) { const int block_id = blockIdx.x; const int tensor_id = meta.tensor_ids[block_id]; - const int chunk_id = static_cast(meta.chunk_ids[block_id]) + - (tensor_id == 0) * meta.start_chunk_id; + const auto chunk_id(static_cast(meta.chunk_ids[block_id]) + + (tensor_id == 0) * meta.start_chunk_id); + const int prev_offset = meta.offsets[tensor_id]; const int next_offset = meta.offsets[tensor_id + 1]; - const int ptr_offset = prev_offset + chunk_id * chunk_size; + const auto ptr_offset(prev_offset + chunk_id * chunk_size); + const int size = min(next_offset - ptr_offset, chunk_size); functor( diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index b41106a6368d7b..d7f54f6b0c0a4d 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -546,7 +546,7 @@ inline void MatmulWithHeadQK(const phi::GPUContext &dev_ctx, seq_len * size_per_head); if (seq_len <= 1024) { - int grid = batch_size * head_num * seq_len; + auto grid = batch_size * head_num * seq_len; int block = seq_len; // Align block to 32, also limit seq_len to max block size. 
@@ -594,7 +594,7 @@ inline void MatmulWithHeadQK(const phi::GPUContext &dev_ctx, qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); } } else { - int grid = batch_size * head_num * seq_len; + auto grid = batch_size * head_num * seq_len; int block = 512; if (seq_len % 2 == 0) { if (std::is_same::value) { @@ -694,7 +694,7 @@ void MultiheadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, T alpha, T beta) { auto stream = dev_ctx.stream(); - const int tsize = batch * head_num * seq_len * head_size; + const auto tsize(batch * head_num * seq_len * head_size); T *qptr = tptr; T *kptr = qptr + tsize; diff --git a/paddle/phi/kernels/funcs/norm_utils.cu.h b/paddle/phi/kernels/funcs/norm_utils.cu.h index 4b1ed6ddb9c9e6..238a306858a12a 100644 --- a/paddle/phi/kernels/funcs/norm_utils.cu.h +++ b/paddle/phi/kernels/funcs/norm_utils.cu.h @@ -459,7 +459,8 @@ void NormDoubleGradFunctor(const DeviceContext &dev_ctx, : x_dims[x_dims.size() - 1]); const int N = x_dims[0]; const int64_t num = X->numel(); - const int sample_size = num / N / C; + const auto sample_size(num / N / C); + phi::DenseTensor scale_tmp; if (!Scale) { scale_tmp.Resize({C}); diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 06bcee3be384c1..51219bb48392fe 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1518,8 +1518,8 @@ void Pool3dDirectCUDAFunctor::operator()( const int padding_height = paddings[1]; const int padding_width = paddings[2]; - int nthreads = batch_size * output_channels * output_depth * output_height * - output_width; + auto nthreads = batch_size * output_channels * output_depth * output_height * + output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON thread_num = 512; diff --git a/paddle/phi/kernels/funcs/sequence_padding.cc b/paddle/phi/kernels/funcs/sequence_padding.cc index 3eb20dec6afcd2..b0110d618f4489 100644 --- a/paddle/phi/kernels/funcs/sequence_padding.cc +++ b/paddle/phi/kernels/funcs/sequence_padding.cc @@ -37,7 +37,7 @@ void CopyValidData(phi::DenseTensor* dst_tensor, T* dst_data = dst_tensor->data(); int seq_cpy_gap = step_width; - int pad_cpy_gap = + auto pad_cpy_gap = layout == kBatchLengthWidth ? step_width : seq_num * step_width; for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { int valid_seq_len = @@ -54,9 +54,9 @@ void CopyValidData(phi::DenseTensor* dst_tensor, pad_seq_len, valid_seq_len)); int seq_data_offset = static_cast(seq_offsets[seq_idx] * step_width); - int pad_data_offset = layout == kBatchLengthWidth - ? seq_idx * pad_seq_len * step_width - : seq_idx * step_width; + auto pad_data_offset = layout == kBatchLengthWidth + ? seq_idx * pad_seq_len * step_width + : seq_idx * step_width; float scale = 1.0f / static_cast(valid_seq_len); for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) { diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index f27174d5818186..a218bdb896b14f 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -48,7 +48,7 @@ __global__ void ScatterKernel(const T* input, for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; + auto channels_i = i - indices_i * vec_channels; int start = unique_value[indices_i]; int end = indices_i == non_zero_num - 1 ? 
rulebook_len @@ -89,14 +89,15 @@ __global__ void ScatterKernelV2(const T* input, for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; + auto channels_i = i - indices_i * vec_channels; StoreT sums = {static_cast(0)}; phi::Load(out + indices_i * channels + channels_i * VecSize, &sums); for (int it = 0; it < buffer_counts; it++) { int len = index_counts[indices_i + it * non_zero_num]; - const int group_offset = it * kernel_size * non_zero_num; + const auto group_offset(it * kernel_size * non_zero_num); + for (int j = 0; j < len; j++) { const int out_feature_i = index_groups[indices_i * kernel_size + j + group_offset]; diff --git a/paddle/phi/kernels/funcs/stack_functor.h b/paddle/phi/kernels/funcs/stack_functor.h index a84967ad7111b3..d73137b15049ac 100644 --- a/paddle/phi/kernels/funcs/stack_functor.h +++ b/paddle/phi/kernels/funcs/stack_functor.h @@ -26,8 +26,8 @@ struct StackFunctor { HOSTDEVICE void operator()(int idx) { int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; + auto which_x = idx / post_ - i * n_; + auto x_index = i * post_ + idx % post_; y_[idx] = x_[which_x][x_index]; } @@ -45,8 +45,8 @@ struct StackGradFunctor { HOSTDEVICE void operator()(int idx) { int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; + auto which_x = idx / post_ - i * n_; + auto x_index = i * post_ + idx % post_; if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; } diff --git a/paddle/phi/kernels/funcs/sync_batch_norm_utils.h b/paddle/phi/kernels/funcs/sync_batch_norm_utils.h index 0715cec7fc8215..bd422351768691 100644 --- a/paddle/phi/kernels/funcs/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/funcs/sync_batch_norm_utils.h @@ -143,8 +143,8 @@ __global__ void KeBackwardLocalStats(const T *dy, BatchNormParamType sum2 = 0.; auto mean = means[k]; for (int i = threadIdx.x; i < N * M; i += blockDim.x) { - int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M - : i * C + k; + auto id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; auto g = static_cast>(dy[id]); sum1 += g; auto x_i = static_cast>(x[id]); @@ -187,8 +187,8 @@ __global__ void KeBackwardLocalStats2D(const T *dy, auto mean = means[k]; for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < N * M; i += gridDim.y * blockDim.y) { - int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M - : i * C + k; + auto id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; auto g = static_cast>(dy[id]); sum1 += g; auto x_i = static_cast>(x[id]); @@ -247,9 +247,10 @@ static __global__ void KeBNBackwardScaleBias( auto inv_var_i = inv_variance[i]; auto mean_i = mean[i]; for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int id = layout == DataLayout::kNCHW - ? ((j / HxW) * C + i) * HxW + (j % HxW) - : j * outer_size + i; + const auto id(layout == DataLayout::kNCHW + ? 
((j / HxW) * C + i) * HxW + (j % HxW) : j * outer_size + i); + auto x_i = static_cast>(x[id]); auto dy_i = static_cast>(dy[id]); ds_sum += dy_i * (x_i - mean_i); diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3273c..54702098ae0e52 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -793,7 +793,7 @@ __device__ void RadixSearch(const T* input, RadixType desired_mask = 0; #pragma unroll - for (int digit_pos = sizeof(T) * 8 - RADIX_BITS; digit_pos >= 0; + for (auto digit_pos = static_cast<int>(sizeof(T) * 8) - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) { RadixCountUsingMask( input, diff --git a/paddle/phi/kernels/funcs/transpose_function.cu.h b/paddle/phi/kernels/funcs/transpose_function.cu.h index 59daa0b8d73c89..2ba8f750aa5bae 100644 --- a/paddle/phi/kernels/funcs/transpose_function.cu.h +++ b/paddle/phi/kernels/funcs/transpose_function.cu.h @@ -153,7 +153,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, if (x < in_effective_thread_num) { // Read a tile from input using block. int x_i = x / TileY; - int x_j = x - x_i * TileY; + auto x_j = x - x_i * TileY; IndexType input_ind = input_origin_block_flat_index + x_i * input_dims[2] + x_j; IndexType input_inc = BlockReadRows * input_dims[2]; @@ -197,7 +197,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, if (x < out_effective_thread_num) { int x_i = x / TileX; - int x_j = x - x_i * TileX; + auto x_j = x - x_i * TileX; IndexType output_ind = output_origin_block_flat_index + x_i * output_dims[2] + x_j; IndexType output_inc = BlockWriteRows * output_dims[2]; @@ -473,7 +473,7 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, // can split input properly, in another words: num_wasted_threads=0. int num_full_tiles = input_long_edge / proposed_tile_long_edge; - int num_wasted_threads = + auto num_wasted_threads = input_long_edge - num_full_tiles * proposed_tile_long_edge; float cost = num_wasted_threads; @@ -951,8 +951,8 @@ struct PermTypeClassifier { type_ = PermuteType::kGeneralTranspose; num_rows_tile_ = GET_TILE_SIZE(dims[rank - 2], kTileSize); int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src); - int tile_size = channel * num_rows_tile_ * - GET_TILE_SIZE(dims[last_idx], kTileSize); + auto tile_size = channel * num_rows_tile_ * + GET_TILE_SIZE(dims[last_idx], kTileSize); vec_size_ = tile_size < sm_count ? 1 : dim_vec_size; } else { type_ = PermuteType::kGeneralPermute; @@ -970,7 +970,7 @@ struct PermTypeClassifier { num_rows_tile_ = GET_TILE_SIZE(dims[0], kTileSize); int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src); - int tile_size = + auto tile_size = dims[1] * num_rows_tile_ * GET_TILE_SIZE(dims[2], kTileSize); vec_size_ = tile_size < sm_count ?
1 : dim_vec_size; } else { @@ -1232,7 +1232,7 @@ struct TransposeDataWriter { OutVecT tmp_data[ReadSize]; #pragma unroll for (int i = 0; i < ReadSize; ++i) { - int tile_tail = tile_y * ReadSize + i; + auto tile_tail = tile_y * ReadSize + i; int major_share_idx = share_tile + tile_tail; IndexT row_in_mat = (blockIdx.x * kColTile + tile_tail) * col_stride; #pragma unroll @@ -1266,7 +1266,8 @@ struct TransposeDataWriter { #pragma unroll for (int tile_y = threadIdx.y; tile_y < cols_range; tile_y += kBlockRows) { - const int shared_major = shared_tile + tile_y * ReadSize; + const auto shared_major(shared_tile + tile_y * ReadSize); + const IndexT row_major = (row_tile + tile_y * ReadSize) * col_stride; #pragma unroll for (int i = 0; i < ReadSize; ++i) { diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h index 709f22bec4ec3a..5272f532b53e25 100644 --- a/paddle/phi/kernels/funcs/unsqueeze.h +++ b/paddle/phi/kernels/funcs/unsqueeze.h @@ -54,8 +54,8 @@ inline DDim GetOutputSqueezeShape(const std::vector squeeze_dims, continue; } - int current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size() - : squeeze_dims[i]; + auto current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size() + : squeeze_dims[i]; PADDLE_ENFORCE_GE( current, @@ -118,7 +118,7 @@ inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, UNSQUEEZE_MAX_RANK_SUPPORTED)); for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_rank + 1 : axis; + auto cur = axis < 0 ? axis + cur_output_rank + 1 : axis; // Validity Check: the axis bound PADDLE_ENFORCE_GE( cur, diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc index b3ffc6d822ef9f..3c3c8891a5e306 100644 --- a/paddle/phi/kernels/funcs/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -61,7 +61,7 @@ class Vol2ColFunctor { int64_t output_depth = col->dims()[4]; int64_t output_height = col->dims()[5]; int64_t output_width = col->dims()[6]; - int channels_col = + auto channels_col = input_channels * filter_depth * filter_height * filter_width; // changed @@ -187,7 +187,7 @@ class Col2VolFunctor { int output_depth = static_cast(col.dims()[4]); int output_height = static_cast(col.dims()[5]); int output_width = static_cast(col.dims()[6]); - int channels_col = + auto channels_col = input_channels * filter_depth * filter_height * filter_width; bool paddings_size_is_6 = (paddings.size() == 6); @@ -238,13 +238,13 @@ class Col2VolFunctor { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; int d_offset = (c / filter_width / filter_height) % filter_depth; - int cIm = c / filter_width / filter_height / filter_depth; + auto cIm = c / filter_width / filter_height / filter_depth; for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + auto d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + auto h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + auto w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { @@ -259,7 +259,7 @@ class Col2VolFunctor { input_channels + cIm; } - int col_idx = + auto col_idx = ((c * output_depth + d) * output_height + h) * 
output_width + w; vol_data[vol_idx] += col_data[col_idx]; diff --git a/paddle/phi/kernels/funcs/vol2col.cu b/paddle/phi/kernels/funcs/vol2col.cu index da81d027effc8e..176d3e1d167d8c 100644 --- a/paddle/phi/kernels/funcs/vol2col.cu +++ b/paddle/phi/kernels/funcs/vol2col.cu @@ -56,11 +56,11 @@ __global__ void vol2col(int64_t num_kernels, int w_out = index % output_width; int h_out = (index / output_width) % output_height; int d_out = (index / output_width / output_height) % output_detph; - int channel_in = index / output_width / output_height / output_detph; - int channel_out = channel_in * filter_depth * filter_height * filter_width; - int w_in = w_out * stride_width - padding_width; - int h_in = h_out * stride_height - padding_height; - int d_in = d_out * stride_depth - padding_depth; + auto channel_in = index / output_width / output_height / output_detph; + auto channel_out = channel_in * filter_depth * filter_height * filter_width; + auto w_in = w_out * stride_width - padding_width; + auto h_in = h_out * stride_height - padding_height; + auto d_in = d_out * stride_depth - padding_depth; data_col += ((static_cast(channel_out) * output_detph + d_out) * output_height + @@ -70,9 +70,9 @@ __global__ void vol2col(int64_t num_kernels, for (int k = 0; k < filter_depth; ++k) { for (int i = 0; i < filter_height; ++i) { for (int j = 0; j < filter_width; ++j) { - int d = d_in + k * dilation_d; - int h = h_in + i * dilation_h; - int w = w_in + j * dilation_w; + auto d = d_in + k * dilation_d; + auto h = h_in + i * dilation_h; + auto w = w_in + j * dilation_w; int64_t vol_idx; if (data_layout != DataLayout::kNHWC) { vol_idx = @@ -242,11 +242,13 @@ __global__ void col2vol(int64_t num_kernels, int output_width, T* data_vol, const DataLayout data_layout) { - const int d_filter_depth = dilation_d * (filter_depth - 1) + 1; - const int d_filter_height = dilation_h * (filter_height - 1) + 1; - const int d_filter_width = dilation_w * (filter_width - 1) + 1; + const auto d_filter_depth(dilation_d * (filter_depth - 1) + 1); - int input_channels = num_kernels / depth / height / width; + const auto d_filter_height(dilation_h * (filter_height - 1) + 1); + + const auto d_filter_width(dilation_w * (filter_width - 1) + 1); + + auto input_channels = num_kernels / depth / height / width; for (int64_t index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; index += blockDim.x * gridDim.x) { @@ -264,22 +266,22 @@ __global__ void col2vol(int64_t num_kernels, : index % input_channels); // compute the start and end of the output - int w_col_start = + auto w_col_start = (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; int w_col_end = min(w / stride_width + 1, output_width); - int h_col_start = + auto h_col_start = (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; int h_col_end = min(h / stride_height + 1, output_height); - int d_col_start = + auto d_col_start = (d < d_filter_depth) ? 
0 : (d - d_filter_depth) / stride_depth + 1; int d_col_end = min(d / stride_depth + 1, output_detph); for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - int d_off = (d - d_col * stride_depth); - int h_off = (h - h_col * stride_height); - int w_off = (w - w_col * stride_width); + auto d_off = (d - d_col * stride_depth); + auto h_off = (h - h_col * stride_height); + auto w_off = (w - w_col * stride_width); if (d_off % dilation_d == 0 && h_off % dilation_h == 0 && w_off % dilation_w == 0) { d_off /= dilation_d; diff --git a/paddle/phi/kernels/funcs/weight_dequant_functor.h b/paddle/phi/kernels/funcs/weight_dequant_functor.h index 7377cab0ac2db5..5d25b07316758c 100644 --- a/paddle/phi/kernels/funcs/weight_dequant_functor.h +++ b/paddle/phi/kernels/funcs/weight_dequant_functor.h @@ -140,7 +140,7 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. - int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); + auto row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); weight += tile_id * k * 2; output += row_id * k; float scale = static_cast(scale_list[row_id]); @@ -196,7 +196,7 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. - int row_id = tile_id * 4 + ((lane_id % 8) / 2); + auto row_id = tile_id * 4 + ((lane_id % 8) / 2); weight += tile_id * k / 2 * 4; output += row_id * k; float scale = static_cast(scale_list[row_id]); @@ -254,14 +254,14 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. - int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); + auto row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); weight += tile_id * k * 2; output += row_id * k; scales += row_id; #pragma unroll for (int i = lane_id * 16; i < k * 2; i += 16 * 32) { - int scale_offset = i / 2 / group_size; + auto scale_offset = i / 2 / group_size; float scale = static_cast(scales[scale_offset * n]); Load(&weight[i], &vec_weight); #pragma unroll @@ -314,14 +314,14 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. 
- int row_id = tile_id * 4 + ((lane_id % 8) / 2); + auto row_id = tile_id * 4 + ((lane_id % 8) / 2); weight += tile_id * k / 2 * 4; output += row_id * k; scales += row_id; #pragma unroll for (int i = lane_id * 32; i < k * 4; i += 32 * 32) { Load(&weight[i / 2], &vec_weight); - int scale_offset = i / 4 / group_size; + auto scale_offset = i / 4 / group_size; float scale = static_cast(scales[scale_offset * n]); #pragma unroll for (int p = 0; p < 32; p += Converter::kHalfLength) { diff --git a/paddle/phi/kernels/funcs/weight_only_gemv.cu b/paddle/phi/kernels/funcs/weight_only_gemv.cu index 5cd1560694138a..1f87a0a8b2dd5b 100644 --- a/paddle/phi/kernels/funcs/weight_only_gemv.cu +++ b/paddle/phi/kernels/funcs/weight_only_gemv.cu @@ -316,7 +316,8 @@ __global__ void int8_weight_only_gemv(const T* input, const int warp_id = threadIdx.x / kWarpSize; const int lane_id = threadIdx.x % kWarpSize; const int tile_id = blockIdx.x * blockDim.x / kWarpSize + warp_id; - const int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); + const auto row_id(tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0)); + weight += tile_id * k * 2; float v = 0.f, scale = static_cast(scale_list[row_id]), v_bias; @@ -922,7 +923,8 @@ __global__ void weight_only_batched_gemv_multi_warp(const T* in, constexpr int Num = Batch * NPerBlock; const int tid = threadIdx.x; const int bid = blockIdx.x; - const int n_start_id = bid * NPerBlock * Interleave; + const auto n_start_id(bid * NPerBlock * Interleave); + using HALF_2_TYPE = typename CUDA_HALF_2_TYPE_TARIS::type; // Calculate the n-dimensional index of the data processed by the current // thread in the interleave tile @@ -1039,7 +1041,7 @@ __global__ void weight_only_batched_gemv_multi_warp(const T* in, #endif bias_v = ConvertFloatFunc::apply(bias[n_start_id + nid]); } - int b = i / NPerBlock / Interleave; + auto b = i / NPerBlock / Interleave; out[b * n + n_start_id + nid] = ConvertDstFunc::apply( GeluActivation::apply(v + bias_v)); } @@ -1066,7 +1068,7 @@ void select_activation_and_bias(const T* input, static constexpr int kInterleave = WeightLayoutDetails::kInterleave; dim3 grid(n / NPerBlock / kInterleave); dim3 block(BlockSize); - int size = sizeof(float) * BlockSize / 32 * Batch * NPerBlock * kInterleave; + auto size = sizeof(float) * BlockSize / 32 * Batch * NPerBlock * kInterleave; if (bias) { if (act_method == "gelu") { weight_only_batched_gemv_multi_warp(batch_starts.size() - 1); - const int offset = tstart * max_bs * D; + const auto offset(tstart * max_bs * D); + batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc index 4ac149b2deae27..72c548bbab8f7e 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc @@ -108,7 +108,7 @@ void SeqCompute(const Context& dev_ctx, hidden_out_data = hidden_out_data + gate_offset; }; for (int i = 0; i < N; ++i) { - int bid = is_reverse ? N - 1 - i : i; + auto bid = is_reverse ? 
N - 1 - i : i; int seq_len = static_cast(x_lod[0][bid + 1] - x_lod[0][bid]); const T* prev_hidden_data = nullptr; int tstart = 0; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc index c00b55f849d5e3..dadcedf2c27e19 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc @@ -135,7 +135,7 @@ void SeqCompute(const Context &dev_ctx, } for (int i = 0; i < N; ++i) { - int bid = is_reverse ? N - 1 - i : i; + auto bid = is_reverse ? N - 1 - i : i; int seq_len = static_cast(x_lod[0][bid + 1] - x_lod[0][bid]); const T *prev_c_data = nullptr; const T *prev_h_data = nullptr; @@ -309,7 +309,8 @@ void BatchCompute(const Context &dev_ctx, // compute kernel part const auto &batch_starts = batched_lod[0]; const int max_seq_len = static_cast(batch_starts.size() - 1); - const int offset = tstart * max_bs * D; + const auto offset(tstart * max_bs * D); + batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc index ecd868b872ad05..449b95f6eafdbe 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc @@ -75,7 +75,7 @@ void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, // zero all up_pad and fill data std::memset(dst_data, 0, up_pad * col_mat_w_sz); dst_data = dst_data + up_pad * src_mat_w; - int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz; + auto copy_size = col_mat_w_sz - up_pad * src_mat_w_sz; for (int j = 0; j < up_pad; ++j) { // blas.VCOPY? 
std::memcpy(dst_data, src_data, copy_size); diff --git a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc index dff41e6d4250cb..7bf2a8ea970e65 100644 --- a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc @@ -80,11 +80,11 @@ void transpose_before_bmm1(const T* qkvBuffer, const T* v_src_each_batch = reinterpret_cast(vBuffer) + blocksize * 3 * i; - int dst_offset_each_bmHead = k * tokenSize * cols_per_bmHead; + auto dst_offset_each_bmHead = k * tokenSize * cols_per_bmHead; int src_offset_each_line = k * cols_per_bmHead; int dst_offset_each_line = j * cols_per_bmHead; - int src_offset_each_bmHead = j * hiddenSize * 3; + auto src_offset_each_bmHead = j * hiddenSize * 3; Tt* q_dst_each_line = q_buffer + i * blocksize + dst_offset_each_bmHead + dst_offset_each_line; @@ -131,7 +131,7 @@ void transpose_after_bmm2(T* Buffer, int dst_offset_each_line = k * hiddenSize; for (int j = 0; j < bmHead; j++) { - int src_offset_each_line = j * tokenSize * cols_per_bmHead; + auto src_offset_each_line = j * tokenSize * cols_per_bmHead; int dst_offset_each_head = j * cols_per_bmHead; Tt* q_dst_each_line = TransBuffer + dst_offset_each_head + @@ -391,13 +391,13 @@ void scaled_dp_attention(const float* query, #else int tid = 0; #endif - int ooffset = + auto ooffset = i * num_head * otsize * head_size + j * otsize * head_size; const float* k = key + ooffset; const float* v = value + ooffset; int q_rblk = std::min(iblk, itsize - m); - int ioffset = + auto ioffset = i * num_head * otsize * head_size + j * otsize * head_size; const float* q = query + ioffset + m * head_size; float* out = output + ioffset + m * head_size; diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu index 6aed60cf1c23b6..c2c8fa31e5d5ee 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu @@ -67,10 +67,10 @@ __global__ void naive_conv2d_kernel(const T *input, const T *residual, float alpha, // for leaky_relu OpType op_type) { - int M = batch * oh * ow; + auto M = batch * oh * ow; int N = oc; int kc = ic / groups; - int K = kc * kh * kw; + auto K = kc * kh * kw; int m_i = threadIdx.x + blockIdx.x * blockDim.x; int n_i = threadIdx.y + blockIdx.y * blockDim.y; if (m_i >= M || n_i >= N) return; @@ -79,23 +79,23 @@ __global__ void naive_conv2d_kernel(const T *input, int oh_i = (m_i % (oh * ow)) / ow; int ow_i = (m_i % (oh * ow)) % ow; int oc_i = n_i; - int groups_i = (oc_i / (oc / groups)); + auto groups_i = (oc_i / (oc / groups)); struct logical_coord weight_shape = {oc, kc, kh, kw}; struct logical_coord input_shape = {batch, ic, ih, iw}; - int out_offset = m_i * N + n_i; + auto out_offset = m_i * N + n_i; float *out_ptr = output + out_offset; float sum = 0.f; for (int k_i = 0; k_i < K; k_i++) { - int ic_i = k_i / (kh * kw) + groups_i * kc; + auto ic_i = k_i / (kh * kw) + groups_i * kc; int kh_i = (k_i % (kh * kw)) / kw; int kw_i = (k_i % (kh * kw)) % kw; struct logical_coord weight_index = {oc_i, k_i / (kh * kw), kh_i, kw_i}; - int ih_i = oh_i * stride_h - pad_h + kh_i * dilation_h; - int iw_i = ow_i * stride_w - pad_w + kw_i * dilation_w; + auto ih_i = oh_i * stride_h - pad_h + kh_i * dilation_h; + auto iw_i = ow_i * stride_w - pad_w + kw_i * dilation_w; if (ih_i < 0 || ih_i >= ih) continue; if (iw_i < 0 || iw_i >= iw) continue; @@ -170,7 +170,7 @@ float 
conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type, T a) { int oh = params.oh; int ow = params.ow; - int M = batch * oh * ow; + auto M = batch * oh * ow; int N = oc; constexpr int blockM = 16; @@ -178,7 +178,7 @@ float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type, T a) { uint3 grid = {(M + blockM - 1) / blockM, (N + blockN - 1) / blockN, 1}; uint3 block = {blockM, blockN, 1}; - int output_size = batch * oc * oh * ow; + auto output_size = batch * oc * oh * ow; T *output_from_cutlass = reinterpret_cast(malloc(sizeof(T) * output_size)); cudaMemcpy(output_from_cutlass, diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h index de85ed672ed43b..527cc07b907266 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h @@ -263,12 +263,12 @@ class SharedLoadIteratorMixed { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int row_ptr_offset = row * ThreadMap::Delta::kRow * stride_ + - group * ThreadMap::Delta::kGroup * stride_ + - cluster * ThreadMap::Delta::kCluster * stride_ + - pointer_offset / LoadType::kElements; + auto row_ptr_offset = row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup * stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); @@ -277,13 +277,13 @@ class SharedLoadIteratorMixed { CUTLASS_PRAGMA_UNROLL for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { - int frag_idx = + auto frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; CUTLASS_PRAGMA_UNROLL for (int v = 0; v < kLoadsPerAccess; ++v) { - int vector_idx = (column * ThreadMap::Delta::kColumn / - kElementsPerAccess * kLoadsPerAccess); + auto vector_idx = (column * ThreadMap::Delta::kColumn / + kElementsPerAccess * kLoadsPerAccess); LoadType const* memory_pointer = pointers_[v] + row_ptr_offset; diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h index 77b3c294c5f1ff..73f9f28acca876 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h @@ -501,8 +501,9 @@ struct GemmFpAIntB { threadblock_tile_offset.m() * Mma::Shape::kM, threadblock_tile_offset.n() * Mma::Shape::kN); - int block_idx = threadblock_tile_offset.m() + - threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + auto block_idx = + threadblock_tile_offset.m() + + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); // Construct the semaphore. 
Semaphore semaphore(params.semaphore + block_idx, thread_idx); diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h index 8ee14f87bdf51e..deb5be2213c260 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h @@ -779,7 +779,7 @@ struct GemmFpAIntBSplitK { int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile(); - int iter_tile_last = + auto iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1; peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first); diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h index 2f41cf3386be07..8a16d3c4e7dcea 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h @@ -268,7 +268,7 @@ class MmaTensorOpComputeBWithF16 { for (int m = 0; m < MmaIterations::kRow; ++m) { int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m); - int n_offsetB = warp_tileB_k_offset + kExpansionFactor * n; + auto n_offsetB = warp_tileB_k_offset + kExpansionFactor * n; if (AccumulatorsInRowMajor) { // matrix B is reordered mma(ptr_D[n + m_serpentine * MmaIterations::kColumn], ptr_A[m_serpentine], @@ -290,7 +290,7 @@ class MmaTensorOpComputeBWithF16 { for (int n = 0; n < MmaIterations::kColumn; ++n) { int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n); - int n_serpentine_offsetB = + auto n_serpentine_offsetB = warp_tileB_k_offset + kExpansionFactor * n_serpentine; if (AccumulatorsInRowMajor) { // matrix B is reordered mma(ptr_D[n_serpentine + m * MmaIterations::kColumn], diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h index e02e79316c460f..6e21d1c0b98799 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h @@ -469,7 +469,8 @@ class MmaTensorOpDequantizer< const int warp_idx_n, const int lane_idx) { const int warp_offset = warp_idx_n * Shape::kN; - const int base_col = lane_idx & 0xF8 + lane_idx % 4; + const auto base_col(lane_idx & 0xF8 + lane_idx % 4); + const int thread_offset = warp_offset + base_col; pointer_ = smem_scales.data() + thread_offset; } diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index ed6f32407f8f67..62fd113e725873 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -162,11 +162,11 @@ void generic_mixed_gemm_kernelLauncher(const T* A, using Gemm = cutlass::gemm::device::GemmUniversalBase; - const int ldb = + const auto ldb( cutlass::platform::is_same::value ? 
n - : k * GemmKernel::kInterleave; + : k * GemmKernel::kInterleave); typename Gemm::Arguments args( {m, n, k}, @@ -272,11 +272,12 @@ void generic_mixed_gemm_kernelLauncher(const T* A, using Gemm = cutlass::gemm::device::GemmUniversalBase; - const int ldb = + const auto ldb( cutlass::platform::is_same::value ? n - : k * GemmKernel::kInterleave; + : k * GemmKernel::kInterleave); + typename Gemm::Arguments args( cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k}, diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h index 8a491ed727c0ea..dc8a10fbda91af 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h @@ -605,10 +605,10 @@ class EpiloguePipelined : public EpilogueBase(ref_.data() + @@ -1961,11 +1961,11 @@ struct B2bGemm< for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { CUTLASS_PRAGMA_UNROLL for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { - int r = + auto r = Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) + m; - int c = mma_n * Delta::kColumn + n; - int idx = + auto c = mma_n * Delta::kColumn + n; + auto idx = n + Policy::LaneMmaShape::kN * (mma_n + Iterations::kColumn * (m + mma_m * Policy::LaneMmaShape::kM)); diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h index 9ce029c61733e5..f82d036bde2b97 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h @@ -309,9 +309,9 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; AccessType* memory_pointer = reinterpret_cast(byte_pointer); @@ -360,13 +360,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -431,13 +431,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * 
ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -517,13 +517,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -533,9 +533,9 @@ class PredicatedTileIteratorPrefetch { int output_P = output_PQ / convolution_Q; int output_Q = output_PQ % convolution_Q; - int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + - (2 * output_P + add_P) * 2 * convolution_Q + - 2 * output_Q + add_Q; + auto input_row = output_N * 2 * convolution_P * 2 * convolution_Q + + (2 * output_P + add_P) * 2 * convolution_Q + + 2 * output_Q + add_Q; int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); @@ -592,13 +592,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -612,9 +612,10 @@ class PredicatedTileIteratorPrefetch { if (output_P > convolution_P - 2) row_add_P = 0; if (output_Q > convolution_Q - 2) row_add_Q = 0; - int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + - ((output_P + row_add_P) / 2) * (convolution_Q / 2) + - (output_Q + row_add_Q) / 2; + auto input_row = + output_N * (convolution_P / 2) * (convolution_Q / 2) + + ((output_P + row_add_P) / 2) * (convolution_Q / 2) + + (output_Q + row_add_Q) / 2; int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h index 0ede20eaec1100..2450dc835f7c16 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h @@ -336,7 +336,7 @@ class PredicatedTileAccessIteratorResidualLast::value / 8) + the_predicates.iteration_vector_; - int strided_index = + auto strided_index = gather_offset_strided + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided; diff --git 
a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h index 6e51ede94d11e9..c9b9a8d18446f5 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h @@ -382,8 +382,8 @@ class PredicatedTileIteratorResidualLast(address_iterator_.get()) + byte_offset; @@ -1071,8 +1071,8 @@ class PredicatedTileIteratorResidualLast(address_iterator_.get()) + byte_offset; diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h index fc3a8317ab70a6..9e9bdd5f97290f 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h @@ -186,7 +186,7 @@ class WarpIteratorFromSmem { CUTLASS_PRAGMA_UNROLL for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; ++access_m_idx) { - int access_idx = + auto access_idx = access_m_idx + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx); diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h index df1edc71866203..828927db0d8fd0 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h @@ -1164,12 +1164,12 @@ struct AttentionBackwardKernel { } int32_t key_start = 0; - int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ; + auto key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ; for (; key_start < key_end; key_start += kBlockSizeJ) { output_frags.clear(); int32_t query_start = getQueryStart(p, key_start); - int32_t query_end = query_start + (p.num_queries - query_start) / - kBlockSizeI * kBlockSizeI; + auto query_end = query_start + (p.num_queries - query_start) / + kBlockSizeI * kBlockSizeI; for (; query_start < query_end; query_start += kBlockSizeI) { processBlockIJ(shared_storage, output_frags, @@ -1243,7 +1243,7 @@ struct AttentionBackwardKernel { CUTLASS_PRAGMA_UNROLL for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) { - int key = key_start + j + (thread_id / kThreadsPerKey); + auto key = key_start + j + (thread_id / kThreadsPerKey); if (!skipBoundsChecks && key >= p.num_keys) { continue; } @@ -1781,7 +1781,7 @@ struct AttentionBackwardKernel { bool isFirst = key_start == 0; int col_id = col / MatmulGradQ::ThreadblockShape::kN; - int storage_id = + auto storage_id = (col_id + query_start / kBlockSizeI * ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN)); diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h index 9b27233f5dff1d..a23b01e685138b 100644 --- a/paddle/phi/kernels/fusion/gpu/block_attn.h +++ b/paddle/phi/kernels/fusion/gpu/block_attn.h @@ -120,8 +120,8 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( act_time_step += params.pre_cache_length; - const int *block_table = - params.block_tables + bi * params.max_num_blocks_per_seq; + const auto *block_table(params.block_tables 
+ + bi * params.max_num_blocks_per_seq); typedef PDDataTypeTraits traits_; typedef typename traits_::DataType DataType_; @@ -134,7 +134,7 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( extern __shared__ char smem_[]; - int block_smem_offset = + auto block_smem_offset = div_up(params.max_num_blocks_per_seq, 4) * 4 * sizeof(int); float *qk_smem = reinterpret_cast(smem_ + block_smem_offset); @@ -170,10 +170,13 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( v_dequant_scale = static_cast(params.cache_v_dequant_scales[kv_hi]); } - const int bhi = bi * params.q_num_head + hi; - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; - const int thi = params.cum_offsets ? ti * params.q_num_head + hi : -1; + const auto bhi(bi * params.q_num_head + hi); + + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); + + const auto thi(params.cum_offsets ? ti * params.q_num_head + hi : -1); + int *block_table_smem = reinterpret_cast(smem_); for (int local_id = tid; local_id < params.max_num_blocks_per_seq; @@ -190,12 +193,12 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( const int physical_block_number = block_table_smem[block_idx]; // cache offset of current token - const int base_cache_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh; + const auto base_cache_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh); // qkv [B, S=1, num_head + 2 * (kv_num_head), head_dim] - int qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; + auto qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -218,7 +221,7 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( // Load the current timestep's Q and K, then compute q*k, // with each block computing one head. 
if (tid < QK_VECS_PER_WARP) { - const int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + const auto qk_offset(qkv_base_offset + tid * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -234,8 +237,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( } if (params.add_qkv_bias) { - const int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + tid * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + tid * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + tid * QK_VEC_SIZE); + Qk_vec q_bias; zero(q_bias); Qk_vec k_bias; @@ -266,16 +271,17 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( } else { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = act_time_step * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = act_time_step * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rope_stride; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.q_num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.q_num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); @@ -313,14 +319,16 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( *reinterpret_cast(&q_smem[tid * QK_VEC_SIZE]) = q; if (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) { if (CACHE_TYPE == CacheType::INT8) { - const int offset = base_cache_offset + tid * QK_VEC_SIZE; + const auto offset(base_cache_offset + tid * QK_VEC_SIZE); + QK_Packed_Int8_t k_tmp = round_tmp( mul(k_quant_scale, k)); *reinterpret_cast(¶ms.k_cache_I[offset]) = k_tmp; } else { - const int offset = base_cache_offset + tid * QK_VEC_SIZE; + const auto offset(base_cache_offset + tid * QK_VEC_SIZE); + *reinterpret_cast(¶ms.k_cache[offset]) = k; } } @@ -390,9 +398,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { const int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int k_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki; + const auto k_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki); + #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { if (ti < act_time_step) { @@ -509,9 +518,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int v_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + vi; + const auto v_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + vi); + V_vec v; if (CACHE_TYPE == CacheType::INT8) { mul_pointer_v2( @@ -627,8 +637,8 @@ __global__ 
__launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( return; } - const int *block_table = - params.block_tables + bi * params.max_num_blocks_per_seq; + const auto *block_table(params.block_tables + + bi * params.max_num_blocks_per_seq); typedef PDDataTypeTraits traits_; typedef typename traits_::DataType DataType_; @@ -641,9 +651,9 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( extern __shared__ char smem_[]; - int block_smem_offset = + auto block_smem_offset = div_up(params.max_num_blocks_per_seq, 4) * 4 * sizeof(int); - int q_smem_offset = + auto q_smem_offset = div_up(Dh_MAX * GQA_SUB_PARTITION_SIZE, 4) * 4 * sizeof(T); T *q_smem = reinterpret_cast(smem_ + block_smem_offset); @@ -678,8 +688,8 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( v_dequant_scale = static_cast(params.cache_v_dequant_scales[kv_hi]); } - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); int *block_table_smem = reinterpret_cast(smem_); @@ -698,12 +708,12 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( const int physical_block_number = block_table_smem[block_idx]; // cache offset of current token - const int base_cache_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh; + const auto base_cache_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh); // qkv [B, S=1, num_head + 2 * (kv_num_head), head_dim] - int qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; + auto qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -730,9 +740,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( const int lane_id = tid % WARP_SIZE; if (warp_id < GQA_SUB_PARTITION_SIZE && lane_id < QK_VECS_PER_WARP) { - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id; - const int qk_offset = qkv_base_offset + lane_id * QK_VEC_SIZE; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id); + + const auto qk_offset(qkv_base_offset + lane_id * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -748,8 +759,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( } if (params.add_qkv_bias) { - const int q_bias_offset = hi * Dh + lane_id * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + lane_id * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + lane_id * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + lane_id * QK_VEC_SIZE); + Qk_vec q_bias; zero(q_bias); Qk_vec k_bias; @@ -780,16 +793,17 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( } else { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = act_time_step * Dh + lane_id * QK_VEC_SIZE; + auto rotary_offset = act_time_step * Dh + lane_id * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rope_stride; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = lane_id / stride_all_lastdim * stride_all_lastdim + - (lane_id + stride) % 
(stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.q_num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = lane_id / stride_all_lastdim * stride_all_lastdim + + (lane_id + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.q_num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); @@ -829,14 +843,16 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( if (Dh == Dh_MAX || lane_id * QK_VEC_SIZE < Dh) { if (CACHE_TYPE == CacheType::INT8) { - const int offset = base_cache_offset + lane_id * QK_VEC_SIZE; + const auto offset(base_cache_offset + lane_id * QK_VEC_SIZE); + QK_Packed_Int8_t k_tmp = round_tmp( mul(k_quant_scale, k)); *reinterpret_cast(¶ms.k_cache_I[offset]) = k_tmp; } else { - const int offset = base_cache_offset + lane_id * QK_VEC_SIZE; + const auto offset(base_cache_offset + lane_id * QK_VEC_SIZE); + *reinterpret_cast(¶ms.k_cache[offset]) = k; } } @@ -857,9 +873,11 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( if (lane_id == 0) { qk *= params.inv_sqrt_dh; - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id; - const int bhi = bi * params.q_num_head + hi; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id); + + const auto bhi(bi * params.q_num_head + hi); + if (params.attn_mask) { auto mask_bhi = bhi; if (params.mask_broadcast_num_heads) { @@ -915,9 +933,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int k_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki; + const auto k_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki); + #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { if (ti < act_time_step) { @@ -940,8 +959,9 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( } #pragma unroll for (int local_hi = 0; local_hi < GQA_SUB_PARTITION_SIZE; local_hi++) { - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi); + K_vec q[K_VECS_PER_THREAD]; #pragma unroll for (int i = 0; i < K_VECS_PER_THREAD; ++i) { @@ -952,7 +972,8 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( float qk = Qk_dot::dot(q, k, params.inv_sqrt_dh); if (params.attn_mask) { - const int bhi = bi * params.q_num_head + hi; + const auto bhi(bi * params.q_num_head + hi); + auto mask_bhi = bhi; if (params.mask_broadcast_num_heads) { mask_bhi = bi; @@ -1058,9 +1079,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int v_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + vi; + const auto v_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * 
BLOCK_SIZE * Dh + block_offset * Dh + vi); + V_vec v; if (CACHE_TYPE == CacheType::INT8) { mul_pointer_v2( @@ -1160,13 +1182,17 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { #pragma unroll for (int local_hi = 0; local_hi < GQA_SUB_PARTITION_SIZE; local_hi++) { - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi; - const int ti = params.cum_offsets - ? bi * params.seq_len - params.cum_offsets[bi] - : -1; - const int thi = params.cum_offsets ? ti * params.q_num_head + hi : -1; - const int bhi = bi * params.q_num_head + hi; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi); + + const auto ti(params.cum_offsets + ? bi * params.seq_len - params.cum_offsets[bi] + : -1); + + const auto thi(params.cum_offsets ? ti * params.q_num_head + hi : -1); + + const auto bhi(bi * params.q_num_head + hi); + #ifdef MMHA_USE_FP32_ACUM_FOR_OUT V_vec tmp_out; convert_from_float(tmp_out, out[local_hi]); @@ -1905,7 +1931,8 @@ __global__ void cache_int8_kernel( if (seq_lens[ori_bi] == 0) continue; const uint32_t ori_seq_id = ori_token_idx % max_seq_len + pre_cache_length; - const int32_t *block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const auto *block_table_now(block_tables + ori_bi * max_blocks_per_seq); + const uint32_t block_idx = block_table_now[ori_seq_id / block_size]; const uint32_t block_offset = ori_seq_id % block_size; @@ -2002,7 +2029,8 @@ __global__ void cache_kernel( if (seq_lens[ori_bi] == 0) continue; const uint32_t ori_seq_id = ori_token_idx % max_seq_len + pre_cache_length; - const int32_t *block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const auto *block_table_now(block_tables + ori_bi * max_blocks_per_seq); + const uint32_t block_idx = block_table_now[ori_seq_id / block_size]; const uint32_t block_offset = ori_seq_id % block_size; @@ -2060,16 +2088,16 @@ __global__ void write_pre_cache_int8_to_cache( linear_index += step) { const int batch_id = linear_index / offset; if (seq_lens[batch_id] == 0) continue; - const int *block_table_now = block_tables + batch_id * max_blocks_per_seq; + const auto *block_table_now(block_tables + batch_id * max_blocks_per_seq); const int32_t cache_seq_id = (linear_index % hidden_size) / head_size; const int32_t head_id = (linear_index % cache_hidden_size) / hidden_size; const int32_t size_id = linear_index % head_size; const int32_t kv_id = (linear_index % offset) / cache_hidden_size; - const int32_t read_id = batch_id * cache_hidden_size + - head_id * hidden_size + cache_seq_id * head_size + - size_id; + const auto read_id(batch_id * cache_hidden_size + head_id * hidden_size + + cache_seq_id * head_size + size_id); + if (kv_id == 0) { phi::Load(&pre_key_cache[read_id], &src_vec); } else { @@ -2079,9 +2107,9 @@ __global__ void write_pre_cache_int8_to_cache( const int block_idx = block_table_now[cache_seq_id / block_size]; const int block_offset = cache_seq_id % block_size; - const int tgt_idx = block_idx * num_heads * block_size * head_size + - head_id * block_size * head_size + - block_offset * head_size + size_id; + const auto tgt_idx(block_idx * num_heads * block_size * head_size + + head_id * block_size * head_size + + block_offset * head_size + size_id); const float scale = kv_id == 0 ? 
cache_k_scales[head_id] : cache_v_scales[head_id]; @@ -2147,16 +2175,16 @@ __global__ void write_pre_cache_to_cache( linear_index += step) { const int batch_id = linear_index / offset; if (seq_lens[batch_id] == 0) continue; - const int *block_table_now = block_tables + batch_id * max_blocks_per_seq; + const auto *block_table_now(block_tables + batch_id * max_blocks_per_seq); const int32_t cache_seq_id = (linear_index % hidden_size) / head_size; const int32_t head_id = (linear_index % cache_hidden_size) / hidden_size; const int32_t size_id = linear_index % head_size; const int32_t kv_id = (linear_index % offset) / cache_hidden_size; - const int32_t read_id = batch_id * cache_hidden_size + - head_id * hidden_size + cache_seq_id * head_size + - size_id; + const auto read_id(batch_id * cache_hidden_size + head_id * hidden_size + + cache_seq_id * head_size + size_id); + if (kv_id == 0) { phi::Load(&pre_key_cache[read_id], &src_vec); } else { @@ -2166,9 +2194,9 @@ __global__ void write_pre_cache_to_cache( const int block_idx = block_table_now[cache_seq_id / block_size]; const int block_offset = cache_seq_id % block_size; - const int tgt_idx = block_idx * num_heads * block_size * head_size + - head_id * block_size * head_size + - block_offset * head_size + size_id; + const auto tgt_idx(block_idx * num_heads * block_size * head_size + + head_id * block_size * head_size + + block_offset * head_size + size_id); if (kv_id == 0) { phi::Store(src_vec, &key_cache[tgt_idx]); @@ -2208,7 +2236,7 @@ void CacheKernel(const phi::GPUContext &dev_ctx, const int32_t block_size = key_cache_out->dims()[2]; // stage 1: write qkv to cache [pre_cache_length:] - int elem_nums = num_tokens * 2 * kv_num_heads * head_size; // just k and v + auto elem_nums = num_tokens * 2 * kv_num_heads * head_size; // just k and v constexpr int PackSize = 16 / sizeof(T); int pack_num = elem_nums / PackSize; const int blocksize = 128; @@ -2376,9 +2404,9 @@ __global__ void quant_write_cache_int8_kernel( idx += blockDim.x * VecSize) { int token_idx = idx / head_size; int h_offset = idx % head_size; - int linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + - (qkv_id + head_group_size) * kv_num_heads * head_size + - hi * head_size + h_offset; + auto linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + + (qkv_id + head_group_size) * kv_num_heads * head_size + + hi * head_size + h_offset; Load(qkv + linear_idx, &in_vec); #pragma unroll @@ -2408,9 +2436,9 @@ __global__ void quant_write_cache_int8_kernel( idx += blockDim.x * VecSize) { int token_idx = idx / head_size; int h_offset = idx % head_size; - int linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + - (qkv_id + head_group_size) * kv_num_heads * head_size + - hi * head_size + h_offset; + auto linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + + (qkv_id + head_group_size) * kv_num_heads * head_size + + hi * head_size + h_offset; Load(qkv + linear_idx, &in_vec); #pragma unroll @@ -2423,7 +2451,8 @@ __global__ void quant_write_cache_int8_kernel( if (ori_bi != b_id) continue; const int ori_seq_id = ori_token_idx % max_seq_len + pre_cache_length; - const int *block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const auto *block_table_now(block_tables + ori_bi * max_blocks_per_seq); + const int block_idx = block_table_now[ori_seq_id / block_size]; const int block_offset = ori_seq_id % block_size; // [max_block_num, num_head, block_size, head_dim/x, x] @@ -2518,8 +2547,9 @@ void DynamicQuantCacheKernel( if 
(pre_key_cache) { // stage 2: write pre_cache to cache [:pre_cache_length] - const int elem_nums = - batch_size * kv_num_heads * pre_cache_length * head_size * 2; + const auto elem_nums(batch_size * kv_num_heads * pre_cache_length * + head_size * 2); + const int pack_num = elem_nums / PackSize; const int blocksize = 128; int grid_size = 1; @@ -2586,7 +2616,8 @@ __global__ void VariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; + const auto emb_idx(ori_seq_id * half_lastdim + h_bias / 2); + const int64_t base_idx = token_idx * 3 * hidden_size + qkv_id * hidden_size + hi * last_dim + h_bias; phi::Load(&qkv[base_idx], &src_vec); @@ -2647,10 +2678,12 @@ __global__ void NeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int base_idx_left = token_idx * 3 * full_hidden_size + - qkv_id * full_hidden_size + hi * last_dim + - h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto base_idx_left(token_idx * 3 * full_hidden_size + + qkv_id * full_hidden_size + hi * last_dim + + h_bias); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); @@ -2687,7 +2720,7 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = token_num * 2 * head_num * dim_head; // just q and k + auto elem_nums = token_num * 2 * head_num * dim_head; // just q and k if (use_neox_style) { elem_nums = token_num * head_num * dim_head; } @@ -2823,10 +2856,12 @@ __global__ void GQANeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int base_idx_left = - token_idx * (q_num_head + 2 * kv_num_head) * last_dim + hi * last_dim + - h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto base_idx_left(token_idx * (q_num_head + 2 * kv_num_head) * + last_dim + + hi * last_dim + h_bias); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); @@ -2864,7 +2899,7 @@ void gqa_rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = + auto elem_nums = token_num * (q_head_num + kv_head_num) * dim_head; // just q and k if (use_neox_style) { elem_nums /= 2; @@ -2951,8 +2986,10 @@ __global__ void VariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * half_lastdim + h_bias / 2); + + const auto bias_idx(qkv_id * hidden_size + hi * last_dim + h_bias); + const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx; phi::Load(&qkv[base_idx], &src_vec); phi::Load(&qkv_biases[bias_idx], &bias_vec); @@ -3033,11 +3070,14 @@ __global__ void NeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = - qkv_id * full_hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(qkv_id * full_hidden_size + hi * last_dim + + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int base_idx_left = token_idx * 3 * full_hidden_size + bias_idx_left; + const auto 
base_idx_left(token_idx * 3 * full_hidden_size + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3093,7 +3133,7 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = token_num * 3 * head_num * dim_head; // just q and k + auto elem_nums = token_num * 3 * head_num * dim_head; // just q and k if (use_neox_style) { elem_nums = token_num * 3 * head_num * dim_head / 2; } @@ -3258,10 +3298,13 @@ __global__ void GQANeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(hi * last_dim + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int base_idx_left = token_idx * offset + bias_idx_left; + const auto base_idx_left(token_idx * offset + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3318,7 +3361,7 @@ void gqa_rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = + auto elem_nums = token_num * (q_head_num + 2 * kv_head_num) * dim_head; // for all q k v if (use_neox_style) { elem_nums /= 2; @@ -3405,8 +3448,10 @@ __global__ void VariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * half_lastdim + h_bias / 2); + + const auto bias_idx(qkv_id * hidden_size + hi * last_dim + h_bias); + const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx; phi::Load(&qkv[base_idx], &src_vec); phi::Load(&qkv_biases[bias_idx], &bias_vec); @@ -3477,11 +3522,14 @@ __global__ void NeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = - qkv_id * full_hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(qkv_id * full_hidden_size + hi * last_dim + + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int base_idx_left = token_idx * 3 * full_hidden_size + bias_idx_left; + const auto base_idx_left(token_idx * 3 * full_hidden_size + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3528,7 +3576,7 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = token_num * 3 * head_num * dim_head; // just q and k + auto elem_nums = token_num * 3 * head_num * dim_head; // just q and k if (use_neox_style) { elem_nums = token_num * 3 * head_num * dim_head / 2; } @@ -3677,10 +3725,13 @@ __global__ void GQANeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(hi * last_dim + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int 
base_idx_left = token_idx * offset + bias_idx_left; + const auto base_idx_left(token_idx * offset + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3728,7 +3779,7 @@ void gqa_rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = + auto elem_nums = token_num * (q_head_num + 2 * kv_head_num) * dim_head; // for all q k v if (use_neox_style) { elem_nums /= 2; @@ -3970,19 +4021,21 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, // [token_num, q_head_num or kv_head_num, size_per_head] if (head_id < q_head_num) { - const int32_t write_idx = token_idx * q_head_num * size_per_head + - head_id * size_per_head + size_id; + const auto write_idx(token_idx * q_head_num * size_per_head + + head_id * size_per_head + size_id); + phi::Store(src_vec, &q_buf[write_idx]); } else { if (head_id < q_head_num + kv_head_num) { - const int32_t write_idx = token_idx * kv_head_num * size_per_head + - (head_id - q_head_num) * size_per_head + - size_id; + const auto write_idx(token_idx * kv_head_num * size_per_head + + (head_id - q_head_num) * size_per_head + size_id); + phi::Store(src_vec, &k_buf[write_idx]); } else { - const int32_t write_idx = + const auto write_idx( token_idx * kv_head_num * size_per_head + - (head_id - q_head_num - kv_head_num) * size_per_head + size_id; + (head_id - q_head_num - kv_head_num) * size_per_head + size_id); + phi::Store(src_vec, &v_buf[write_idx]); } } @@ -4003,8 +4056,9 @@ void qkv_transpose_split(const phi::GPUContext &dev_ctx, const int kv_head_num, const int seq_len, const int size_per_head) { - const int32_t elem_cnt = - token_num * (q_head_num + kv_head_num * 2) * size_per_head; + const auto elem_cnt(token_num * (q_head_num + kv_head_num * 2) * + size_per_head); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -4065,9 +4119,9 @@ __global__ void write_pre_cache_to_kv_buffer( const int32_t kv_id = (linear_index % fused_hidden_size) / cache_hidden_size; - const int32_t read_id = batch_id * cache_hidden_size + - head_id * hidden_size + cache_seq_id * head_dim + - size_id; + const auto read_id(batch_id * cache_hidden_size + head_id * hidden_size + + cache_seq_id * head_dim + size_id); + if (kv_id == 0) { phi::Load(&pre_key_cache[read_id], &src_vec); } else { @@ -4075,10 +4129,11 @@ __global__ void write_pre_cache_to_kv_buffer( } const int tmp_max_len_this_time = max_len_this_time + pre_cache_length; - const int32_t write_idx = - batch_id * num_head * tmp_max_len_this_time * head_dim + - head_id * tmp_max_len_this_time * head_dim + cache_seq_id * head_dim + - size_id; + const auto write_idx(batch_id * num_head * tmp_max_len_this_time * + head_dim + + head_id * tmp_max_len_this_time * head_dim + + cache_seq_id * head_dim + size_id); + if (kv_id == 0) { phi::Store(src_vec, &k_buf[write_idx]); } else { @@ -4126,28 +4181,31 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, const int tmp_max_len_this_time = max_len_this_time + (head_id < q_head_num ? 0 : pre_cache_length); - const int tmp_seq_id = - head_id < q_head_num ? seq_id : seq_id + pre_cache_length; + const auto tmp_seq_id(head_id < q_head_num ? 
seq_id + : seq_id + pre_cache_length); if (head_id < q_head_num) { - const int write_idx = - target_batch_id * q_head_num * tmp_max_len_this_time * size_per_head + - head_id * tmp_max_len_this_time * size_per_head + - tmp_seq_id * size_per_head + size_id; + const auto write_idx(target_batch_id * q_head_num * + tmp_max_len_this_time * size_per_head + + head_id * tmp_max_len_this_time * size_per_head + + tmp_seq_id * size_per_head + size_id); + phi::Store(src_vec, &q_buf[write_idx]); } else if (head_id < q_head_num + kv_head_num) { - const int write_idx = - target_batch_id * kv_head_num * tmp_max_len_this_time * - size_per_head + - (head_id - q_head_num) * tmp_max_len_this_time * size_per_head + - tmp_seq_id * size_per_head + size_id; + const auto write_idx(target_batch_id * kv_head_num * + tmp_max_len_this_time * size_per_head + + (head_id - q_head_num) * tmp_max_len_this_time * + size_per_head + + tmp_seq_id * size_per_head + size_id); + phi::Store(src_vec, &k_buf[write_idx]); } else { - const int write_idx = target_batch_id * kv_head_num * - tmp_max_len_this_time * size_per_head + - (head_id - q_head_num - kv_head_num) * - tmp_max_len_this_time * size_per_head + - tmp_seq_id * size_per_head + size_id; + const auto write_idx(target_batch_id * kv_head_num * + tmp_max_len_this_time * size_per_head + + (head_id - q_head_num - kv_head_num) * + tmp_max_len_this_time * size_per_head + + tmp_seq_id * size_per_head + size_id); + phi::Store(src_vec, &v_buf[write_idx]); } } @@ -4172,7 +4230,7 @@ void qkv_transpose_split( const int seq_len, const int pre_cache_length, const int size_per_head) { - int32_t elem_cnt = token_num * (q_head_num + kv_head_num * 2) * size_per_head; + auto elem_cnt = token_num * (q_head_num + kv_head_num * 2) * size_per_head; constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, @@ -4241,8 +4299,10 @@ __global__ void GetDecoderTensorKernel(const T *qkv_out, i += gridDim.x * blockDim.x * VecSize) { const int bi = i / fused_hidden_size; const int bias_idx = i % fused_hidden_size; - const int ori_token_idx = bi * seq_len - cum_offsets[bi]; - const int src_offset = ori_token_idx * fused_hidden_size + bias_idx; + const auto ori_token_idx(bi * seq_len - cum_offsets[bi]); + + const auto src_offset(ori_token_idx * fused_hidden_size + bias_idx); + if (src_offset >= qkv_out_nums) continue; phi::Load(&qkv_out[src_offset], &src_vec); phi::Store(src_vec, &qkv_out_decoder[i]); @@ -4267,7 +4327,8 @@ __global__ void GetDecoderRoPEKernel(const T *rope_emb, for (int i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int bi = i / dim_head; - const int src_offset = bi * seq_len * dim_head + i % dim_head; + const auto src_offset(bi * seq_len * dim_head + i % dim_head); + phi::Load(&rope_cos_emb[src_offset], &src_vec); phi::Store(src_vec, &cos_emb[i]); phi::Load(&rope_sin_emb[src_offset], &src_vec); @@ -4429,9 +4490,10 @@ __global__ void TransposeRemovingPadding(const T *input_data, const int ori_seq_id = ori_token_idx % seq_len; const int ori_head_id = (linear_index % dim_embed) / head_dim; const int ori_head_lane = (linear_index % dim_embed) % head_dim; - const int ori_idx = ori_batch_id * num_head * max_len_this_time * head_dim + - ori_head_id * max_len_this_time * head_dim + - ori_seq_id * head_dim + ori_head_lane; + const auto ori_idx(ori_batch_id * num_head * max_len_this_time * head_dim + + ori_head_id * max_len_this_time * head_dim + + ori_seq_id * head_dim + ori_head_lane); + phi::Load(&input_data[ori_idx], 
&src_vec); phi::Store(src_vec, &output_data[linear_index]); } @@ -4452,7 +4514,8 @@ void InvokeTransposeRemovePadding(const phi::GPUContext &dev_ctx, // [batch_size, num_head, max_len_this_time, head_dim] -> [token_num, // num_head, head_dim] constexpr int VEC_16B = 16; - const int elem_cnt = token_num * num_head * head_dim; + const auto elem_cnt(token_num * num_head * head_dim); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ( head_dim % PackSize, diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu index dc2d495f7bb18d..f6056dd8215952 100644 --- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu @@ -341,7 +341,8 @@ void DispatchWithDtype( const int kv_num_head = key_cache_dims[1]; const int dim_head = key_cache_dims[3]; const int total_num_head = qkv.dims()[qkv.dims().size() - 1] / dim_head; - const int q_num_head = total_num_head - 2 * kv_num_head; + const auto q_num_head(total_num_head - 2 * kv_num_head); + const int bsz = cum_offsets.dims()[0]; const int max_block_per_seq = block_tables.dims()[1]; VLOG(3) << "bsz: " << bsz << " token_num: " << token_num diff --git a/paddle/phi/kernels/fusion/gpu/fmha_ref.h b/paddle/phi/kernels/fusion/gpu/fmha_ref.h index 98e456f177e27e..7b71bdbbe2817a 100644 --- a/paddle/phi/kernels/fusion/gpu/fmha_ref.h +++ b/paddle/phi/kernels/fusion/gpu/fmha_ref.h @@ -95,9 +95,10 @@ __global__ void TransposeRemovingPadding(const T* input_data, const int ori_seq_id = ori_token_idx % seq_len; const int ori_head_id = (linear_index % dim_embed) / head_dim; const int ori_head_lane = (linear_index % dim_embed) % head_dim; - const int ori_idx = ori_batch_id * num_head * seq_len * head_dim + - ori_head_id * seq_len * head_dim + - ori_seq_id * head_dim + ori_head_lane; + const auto ori_idx(ori_batch_id * num_head * seq_len * head_dim + + ori_head_id * seq_len * head_dim + + ori_seq_id * head_dim + ori_head_lane); + phi::Load(&input_data[ori_idx], &src_vec); phi::Store(src_vec, &output_data[linear_index]); } @@ -116,7 +117,8 @@ void InvokeTransposeRemovePadding(const phi::GPUContext& dev_ctx, // [batch_size, num_head, seq_len, head_dim] -> [token_num, num_head, // head_dim] constexpr int VEC_16B = 16; - const int elem_cnt = token_num * num_head * head_dim; + const auto elem_cnt(token_num * num_head * head_dim); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ( head_dim % PackSize, @@ -535,7 +537,7 @@ class FMHARef { phi::DenseTensor* src_mask_grad_tensor, phi::DenseTensor* qkv_input_grad_tensor) { auto blas = phi::funcs::GetBlas(dev_ctx_); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; + auto q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int k_size = q_size; int softmax_axis = -1; diff --git a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu index 3b3c78e45fad23..24cca298143663 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu @@ -59,8 +59,8 @@ void ComputeMergedQKVMatmulBackward( dev_ctx.Alloc(qkv_weight_grad, qkv_weight_grad->numel() * sizeof(T)); // Gradient of GEMM(query, qkv_weight) - int m = config.batch_size * config.seq_len_m * config.seq_len_r; - int n = 3 * config.num_heads * config.head_dim; + auto m = config.batch_size * 
config.seq_len_m * config.seq_len_r; + auto n = 3 * config.num_heads * config.head_dim; int k = config.q_dim; auto qkv_compute = phi::fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); @@ -95,7 +95,7 @@ void ComputeSeparatedQKVMatmulBackward( const auto *key_weight = &key_weight_in; dev_ctx.Alloc(key_weight_grad, key_weight_grad->numel() * sizeof(T)); - int kv_m = config.batch_size * config.seq_len_m * config.m_size; + auto kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; int kv_k = config.kv_dim; auto kv_compute = phi::fusion::AttnMatMul( @@ -119,7 +119,7 @@ void ComputeSeparatedQKVMatmulBackward( const auto *query_weight = &query_weight_in; dev_ctx.Alloc(query_weight_grad, query_weight_grad->numel() * sizeof(T)); - int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto q_m = config.batch_size * config.seq_len_m * config.seq_len_r; int q_n = config.num_heads * config.head_dim; int q_k = config.q_dim; auto q_compute = @@ -155,7 +155,7 @@ void ComputeGatingLinearBackward( gate_bias_out.Resize(config.gate_out_dims); dev_ctx.Alloc(&gate_bias_out, gate_bias_out.numel() * sizeof(T)); - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.num_heads * config.head_dim; int k = config.q_dim; auto gate_linear = @@ -208,7 +208,7 @@ void ComputeOutputLinearBackward( dev_ctx.Alloc(out_linear_bias_grad, out_linear_bias_grad->numel() * sizeof(T)); - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; int k = config.num_heads * config.head_dim; auto out_linear = diff --git a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu index d1722a5006ce64..1d896ba7b32a3a 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu @@ -49,8 +49,8 @@ void ComputeMergedQKVMatmulForward( auto *qkv_weight = &qkv_weight_in; // qkv_out = GEMM(query, qkv_weight^T) - int m = config.batch_size * config.seq_len_m * config.seq_len_r; - int n = 3 * config.num_heads * config.head_dim; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto n = 3 * config.num_heads * config.head_dim; int k = config.q_dim; auto qkv_compute = phi::fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); @@ -77,7 +77,7 @@ void ComputeSeparatedQKVMatmulForward( // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] // query_weight: shape=[q_dim, num_heads, head_dim] // query_out: shape=[batch_size, seq_len_m, seq_len_r, num_heads, head_dim] - int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto q_m = config.batch_size * config.seq_len_m * config.seq_len_r; int q_n = config.num_heads * config.head_dim; int q_k = config.q_dim; auto q_compute = @@ -88,7 +88,7 @@ void ComputeSeparatedQKVMatmulForward( // key: shape=[batch_size, seq_len_m, m_size, kv_dim] // key_weight: shape=[kv_dim, num_heads, head_dim] // key_out: shape=[batch_size, seq_len_m, m_size, num_heads, head_dim] - int kv_m = config.batch_size * config.seq_len_m * config.m_size; + auto kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; int kv_k = config.kv_dim; auto kv_compute = phi::fusion::AttnMatMul( @@ -116,7 +116,7 @@ void ComputeGatingLinearForward( // and the 
second gate_bias_out stores the result of the multiplication + // bias. // gate_out = GEMM(query, gate_weight) + gate_bias - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.num_heads * config.head_dim; int k = config.q_dim; auto gate_linear = @@ -148,7 +148,7 @@ void ComputeOutputLinearForward( const auto *out_linear_bias = &out_linear_bias_in; // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; int k = config.num_heads * config.head_dim; auto out_linear = diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d559ff6..5ec0b78974d3d8 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -343,7 +343,7 @@ __global__ void FusedLayernormResidualDropoutBiasInfer( T *layernorm_dst) { int col_id = threadIdx.x; int row_id = blockIdx.x; - int idx = row_id * cols + col_id; + auto idx = row_id * cols + col_id; GPURAND(StatePhilox4_32_10_t) state; GPURAND(_init)(seed, idx, increment, &state); @@ -579,10 +579,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( const int warp_n = warp % WARPS_N; // 0 const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 - const int c = warp_n * THREADS_PER_WARP + lane; // lane - const int r = bidx * ROWS_PER_CTA + warp_m; // row id + const auto c(warp_n * THREADS_PER_WARP + lane); + // lane + const auto r(bidx * ROWS_PER_CTA + warp_m); + // row id - int idx = r * ELTS_PER_ROW + c; + auto idx = r * ELTS_PER_ROW + c; GPURAND(StatePhilox4_32_10_t) state; if (HasDropout) { GPURAND(_init)(seed, idx, increment, &state); diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu index 72c5453b439ff6..ec536b9ca97f29 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu @@ -208,9 +208,9 @@ void FusedMultiTransformerOpKernel( dim_head = trans_qkvw ? qkv_w_dims[2] : qkv_w_dims[3]; } int hidden_size = num_head * dim_head; - int output_size = gqa_group_size <= 0 - ? 3 * hidden_size - : (num_head + 2 * gqa_group_size) * dim_head; + auto output_size = gqa_group_size <= 0 + ? 3 * hidden_size + : (num_head + 2 * gqa_group_size) * dim_head; int input_size = dim_embed; // Set a flag whether need to add Matmul / Layernorm bias. diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h index a8191bc6b4a313..5970013c790e62 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h @@ -153,11 +153,15 @@ __global__ void masked_multihead_attention_kernel( const int hi = blockIdx.x; const int kv_hi = hi / params.gqa_num_per_partitions; // if no gqa, kv_hi = hi - const int bhi = bi * params.num_head + hi; - const int bbhi = bbi * params.beam_width * params.num_head + hi; - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; - const int thi = params.cum_offsets ? 
ti * params.num_head + hi : -1; + const auto bhi(bi * params.num_head + hi); + + const auto bbhi(bbi * params.beam_width * params.num_head + hi); + + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); + + const auto thi(params.cum_offsets ? ti * params.num_head + hi : -1); + const int tid = threadIdx.x; const int bi_seq_len_offset = bi * params.max_seq_length; @@ -170,7 +174,8 @@ __global__ void masked_multihead_attention_kernel( : params.sequence_lengths[bi]; // qkv [B, S=1, num_head + 2 * gqa_group_size, head_dim] - int qkv_base_offset = bi * (params.num_head + 2 * params.gqa_group_size) * Dh; + auto qkv_base_offset = + bi * (params.num_head + 2 * params.gqa_group_size) * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -194,9 +199,10 @@ __global__ void masked_multihead_attention_kernel( } if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - const int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + tid * QK_VEC_SIZE; + auto qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + tid * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + tid * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -234,7 +240,7 @@ __global__ void masked_multihead_attention_kernel( if (!params.neox_rotary_style) { if (params.rotary_emb_dims != 0) { - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; Qk_vec_RoPE cos_emb, sin_emb; @@ -255,16 +261,17 @@ __global__ void masked_multihead_attention_kernel( if (params.rotary_emb_dims != 0) { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); if (Dh == Dh_MAX || right_id * QK_VEC_SIZE < Dh) { @@ -304,10 +311,10 @@ __global__ void masked_multihead_attention_kernel( int co = tid / QK_VECS_IN_16B; int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = bi * params.gqa_group_size * params.max_seq_length * Dh + - kv_hi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - act_time_step * QK_ELTS_IN_16B + ci; + auto offset = bi * params.gqa_group_size * params.max_seq_length * Dh + + kv_hi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + act_time_step * QK_ELTS_IN_16B + ci; if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { *reinterpret_cast(¶ms.cache_kv[offset]) = k; } @@ -376,7 +383,7 @@ __global__ void masked_multihead_attention_kernel( zero(k_vec_zero); #pragma 
unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + ti; + auto jj = ii * params.max_seq_length + ti; // get k from the cache_kv, and dequant k for qk operation if (ti < act_time_step) { k[ii] = @@ -803,17 +810,19 @@ __global__ void multi_block_masked_multihead_attention_kernel( const int hi = blockIdx.x; // head_idx const int kv_hi = hi / params.gqa_num_per_partitions; - const int bhi = bi * params.num_head + hi; - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; - const int thi = params.cum_offsets ? ti * params.num_head + hi : -1; + const auto bhi(bi * params.num_head + hi); + + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); + + const auto thi(params.cum_offsets ? ti * params.num_head + hi : -1); float qk_max = -FLT_MAX; float qk = 0; // qkv [B, S=1, 3, num_head, head_dim] - int qkv_base_offset = bi * (params.num_head + 2 * params.gqa_group_size) * - Dh; // // if no gqa, gqa_group_size = num_head + auto qkv_base_offset = bi * (params.num_head + 2 * params.gqa_group_size) * + Dh; // // if no gqa, gqa_group_size = num_head constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); @@ -836,7 +845,7 @@ __global__ void multi_block_masked_multihead_attention_kernel( } if (tid < QK_VECS_PER_WARP) { - const int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + const auto qk_offset(qkv_base_offset + tid * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -852,8 +861,10 @@ __global__ void multi_block_masked_multihead_attention_kernel( } if (params.add_qkv_bias) { - const int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + tid * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + tid * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + tid * QK_VEC_SIZE); + Qk_vec q_bias; zero(q_bias); Qk_vec k_bias; @@ -874,7 +885,7 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (!params.neox_rotary_style) { if (params.rotary_emb_dims != 0) { - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; Qk_vec_RoPE cos_emb, sin_emb; @@ -895,16 +906,17 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (params.rotary_emb_dims != 0) { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); if (Dh == Dh_MAX || right_id * QK_VEC_SIZE < Dh) { @@ -944,10 +956,10 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) { int co = 
tid / QK_VECS_IN_16B; int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = bi * params.gqa_group_size * params.max_seq_length * Dh + - kv_hi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - act_time_step * QK_ELTS_IN_16B + ci; + auto offset = bi * params.gqa_group_size * params.max_seq_length * Dh + + kv_hi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + act_time_step * QK_ELTS_IN_16B + ci; *reinterpret_cast(¶ms.cache_kv[offset]) = k; } @@ -1011,13 +1023,13 @@ __global__ void multi_block_masked_multihead_attention_kernel( for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { // First, move each block to their start position. const int time_now = ti + partition_times_timesteps_per_block; - const int k_offset = + const auto k_offset( bi * params.gqa_group_size * params.max_seq_length * Dh + - kv_hi * params.max_seq_length * Dh + time_now * Dh + ki; + kv_hi * params.max_seq_length * Dh + time_now * Dh + ki); #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + time_now; + auto jj = ii * params.max_seq_length + time_now; if (time_now < act_time_step) { k[ii] = (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.max_seq_length) @@ -1209,9 +1221,10 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { // Compute the index to store in `partial_out`. - const int32_t store_partial_idx = + const auto store_partial_idx( bi * params.num_head * params.max_num_partitions * Dh + - hi * params.max_num_partitions * Dh + partition_idx * Dh + vi; + hi * params.max_num_partitions * Dh + partition_idx * Dh + vi); + // Actually, we do not need the store_func, just use T vectorized type // `V_vec` to store in params.partial_out. #ifdef MMHA_USE_FP32_ACUM_FOR_OUT @@ -1240,11 +1253,13 @@ __launch_bounds__(THREADS_PER_BLOCK) void multi_block_attention_reduce_kernel( return; } - const int bhi = seq_idx * params.num_head + head_idx; - const int ti = params.cum_offsets - ? seq_idx * params.seq_len - params.cum_offsets[seq_idx] - : -1; - const int thi = params.cum_offsets ? ti * params.num_head + head_idx : -1; + const auto bhi(seq_idx * params.num_head + head_idx); + + const auto ti(params.cum_offsets + ? seq_idx * params.seq_len - params.cum_offsets[seq_idx] + : -1); + + const auto thi(params.cum_offsets ? 
ti * params.num_head + head_idx : -1); const int num_partitions = div_up(context_len, params.partition_size); if (num_partitions == 1) { @@ -1367,7 +1382,7 @@ inline size_t get_reduce_smem_size_in_bytes( const Masked_multihead_attention_params ¶ms) { const int32_t max_num_partitions = div_up(params.timestep, params.partition_size); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + auto reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); VLOG(1) << "get_reduce_smem_size_in_bytes, reduce_shared_mem_size: " << reduce_shared_mem_size; return reduce_shared_mem_size; @@ -1776,8 +1791,8 @@ void write_cache_kv(const phi::GPUContext &dev_ctx, common::errors::PreconditionNotMet( "dim_head=%d must be divisible by vec_size=%d", dim_head, x)); - int max_size = max_seq_len * dim_head / x; - int size = seq_len * dim_head / x; + auto max_size = max_seq_len * dim_head / x; + auto size = seq_len * dim_head / x; dim3 grid(div_up(max_size, block_sz), bsz, num_head); dim3 grid_v(div_up(size, block_sz), bsz, num_head); @@ -1819,10 +1834,10 @@ __global__ void gqa_write_cache_k_kernel(T *cache_k, const int local_token_id = ori_token_id % seq_len; - const int tgt_idx = ori_bi * gqa_group_size * max_seq_len * dim_head + - head_idx * max_seq_len * dim_head + - head_vec_id * max_seq_len * X_ELEMS + - local_token_id * X_ELEMS; + const auto tgt_idx(ori_bi * gqa_group_size * max_seq_len * dim_head + + head_idx * max_seq_len * dim_head + + head_vec_id * max_seq_len * X_ELEMS + + local_token_id * X_ELEMS); phi::Load(&k[linear_idx], &in_vec); phi::Store(in_vec, &cache_k[tgt_idx]); @@ -1855,9 +1870,9 @@ __global__ void gqa_write_cache_v_kernel(T *cache_v, const int local_token_id = ori_token_id % seq_len; - const int tgt_idx = ori_bi * gqa_group_size * max_seq_len * dim_head + - head_idx * max_seq_len * dim_head + - local_token_id * dim_head + head_offset; + const auto tgt_idx(ori_bi * gqa_group_size * max_seq_len * dim_head + + head_idx * max_seq_len * dim_head + + local_token_id * dim_head + head_offset); phi::Load(&v[linear_idx], &in_vec); phi::Store(in_vec, &cache_v[tgt_idx]); @@ -1955,8 +1970,9 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, const int32_t head_id = (linear_index % hidden_size) / size_per_head; const int32_t size_id = linear_index % size_per_head; - const int32_t write_idx = - token_idx * hidden_size + head_id * size_per_head + size_id; + const auto write_idx(token_idx * hidden_size + head_id * size_per_head + + size_id); + if (qkv_id == 0) { phi::Store(src_vec, &q_buf[write_idx]); } else if (qkv_id == 1) { @@ -1980,7 +1996,8 @@ void qkv_transpose_split(const phi::GPUContext &dev_ctx, const int head_num, const int seq_len, const int size_per_head) { - const int32_t elem_cnt = token_num * head_num * size_per_head * 3; + const auto elem_cnt(token_num * head_num * size_per_head * 3); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2020,7 +2037,8 @@ __global__ void add_fusedQKV_bias_transpose_split_kernel( const int token_num, const int head_num, const int size_per_head) { - const int32_t offset = batch_size * seq_len * head_num * size_per_head; + const auto offset(batch_size * seq_len * head_num * size_per_head); + const int32_t hidden_size = head_num * size_per_head; const int32_t fused_hidden_size = 3 * hidden_size; int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -2101,7 +2119,8 @@ void qkv_bias_add_transpose_split(const phi::GPUContext &dev_ctx, const int seq_len, const int 
size_per_head, bool compute_bias) { - const int32_t elem_cnt = token_num * head_num * size_per_head * 3; + const auto elem_cnt(token_num * head_num * size_per_head * 3); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2179,19 +2198,21 @@ __global__ void gqa_fusedQKV_transpose_split_kernel(T *q_buf, // [token_num, num_head or gqa_group_size, size_per_head] if (head_id < head_num) { - const int32_t write_idx = token_idx * head_num * size_per_head + - head_id * size_per_head + size_id; + const auto write_idx(token_idx * head_num * size_per_head + + head_id * size_per_head + size_id); + phi::Store(src_vec, &q_buf[write_idx]); } else { if (head_id < head_num + gqa_group_size) { - const int32_t write_idx = token_idx * gqa_group_size * size_per_head + - (head_id - head_num) * size_per_head + - size_id; + const auto write_idx(token_idx * gqa_group_size * size_per_head + + (head_id - head_num) * size_per_head + size_id); + phi::Store(src_vec, &k_buf[write_idx]); } else { - const int32_t write_idx = + const auto write_idx( token_idx * gqa_group_size * size_per_head + - (head_id - head_num - gqa_group_size) * size_per_head + size_id; + (head_id - head_num - gqa_group_size) * size_per_head + size_id); + phi::Store(src_vec, &v_buf[write_idx]); } } @@ -2212,8 +2233,9 @@ void gqa_qkv_transpose_split(const phi::GPUContext &dev_ctx, const int seq_len, const int size_per_head, const int gqa_group_size) { - const int32_t elem_cnt = - token_num * (head_num + 2 * gqa_group_size) * size_per_head; + const auto elem_cnt(token_num * (head_num + 2 * gqa_group_size) * + size_per_head); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2259,12 +2281,13 @@ __global__ void NeoXRotaryKernel(const T *input, if (sequence_lengths && si >= sequence_lengths[bi] * rotary_emb_dims) return; int half_lastdim = last_dim / 2; for (int ti = threadIdx.x; ti < half_lastdim; ti += blockDim.x) { - int base_idx = bi * head_num * seq_len * last_dim + - hi * seq_len * last_dim + si * last_dim; + auto base_idx = bi * head_num * seq_len * last_dim + + hi * seq_len * last_dim + si * last_dim; int left_idx = base_idx + ti; - const int right_idx = base_idx + ti + half_lastdim; - int emb_idx_left = bi * seq_len * last_dim + si * last_dim + ti; - int emb_idx_right = + const auto right_idx(base_idx + ti + half_lastdim); + + auto emb_idx_left = bi * seq_len * last_dim + si * last_dim + ti; + auto emb_idx_right = bi * seq_len * last_dim + si * last_dim + ti + half_lastdim; float input_left = static_cast(input[left_idx]); float input_right = static_cast(input[right_idx]); @@ -2302,11 +2325,12 @@ __global__ void RotaryKernel(const T *input, // Note(ZhenyuLi): Calculate the relevant data at one time, so that no // additional space is required. 
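  // Worked example of the two rotary index pairings (illustrative numbers
  // only, not taken from the original source): with last_dim = 128 and
  // ti = 3, this interleaved (non-NeoX) kernel rotates base_idx + 6 with
  // base_idx + 7 using cos_emb/sin_emb at emb_idx = bi * seq_len * 128 +
  // si * 128 + 6, while NeoXRotaryKernel above instead pairs base_idx + 3
  // with base_idx + 3 + 64 and reads separate emb_idx_left / emb_idx_right
  // entries at row offsets ti and ti + half_lastdim.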
for (int ti = threadIdx.x; ti < half_lastdim; ti += blockDim.x) { - int base_idx = bi * head_num * seq_len * last_dim + - hi * seq_len * last_dim + si * last_dim; - int left_idx = base_idx + 2 * ti; - const int right_idx = base_idx + 2 * ti + 1; - int emb_idx = bi * seq_len * last_dim + si * last_dim + 2 * ti; + auto base_idx = bi * head_num * seq_len * last_dim + + hi * seq_len * last_dim + si * last_dim; + auto left_idx = base_idx + 2 * ti; + const auto right_idx(base_idx + 2 * ti + 1); + + auto emb_idx = bi * seq_len * last_dim + si * last_dim + 2 * ti; float input_left = static_cast(input[left_idx]); float input_right = static_cast(input[right_idx]); float cos_tmp = cos_emb[emb_idx]; @@ -2659,7 +2683,7 @@ __global__ void BiasAct(const T *bias, i += gridDim.x * blockDim.x * VecSize) { int row_idx = i / cols; int col_idx = i % cols; - int linear_idx = row_idx * cols + col_idx; + auto linear_idx = row_idx * cols + col_idx; // phi::Load(&input[linear_idx], &src_vec); load_func.template load(&src_vec, linear_idx); if (bias) { @@ -2730,8 +2754,8 @@ __global__ void fused_transpose_split_kernel( const int token_num, const int head_num, const int size_per_head) { - const int32_t offset = - batch_size * max_len_this_time * head_num * size_per_head; + const auto offset(batch_size * max_len_this_time * head_num * size_per_head); + const int32_t hidden_size = head_num * size_per_head; const int32_t fused_hidden_size = 3 * hidden_size; int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -2776,7 +2800,7 @@ __global__ void fused_transpose_split_kernel( seq_id * size_per_head + size_id], &src_vec); } - int32_t write_index = + auto write_index = linear_index - (qkv_id + 2 * current_token) * hidden_size; if (qkv_id == 0) { phi::Store(src_vec, &q_out[write_index]); @@ -2803,7 +2827,8 @@ void TransposeSplit(const phi::GPUContext &dev_ctx, const int max_len_this_time, const int seq_len, const int size_per_head) { - const int32_t elem_cnt = token_num * head_num * size_per_head * 3; + const auto elem_cnt(token_num * head_num * size_per_head * 3); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2953,7 +2978,8 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, const int rope_bsz) { - const int elem_nums = token_num * 3 * head_num * dim_head; // just q and k + const auto elem_nums(token_num * 3 * head_num * dim_head); + // just q and k constexpr int PackSize = 16 / sizeof(T); const int pack_num = elem_nums / PackSize; const int blocksize = 128; @@ -3067,8 +3093,8 @@ void gqa_rotary_qk_variable( const int dim_head, const int gqa_group_size, const int rope_bsz) { - const int elem_nums = - token_num * (head_num + 2 * gqa_group_size) * dim_head; // for all q k v + const auto elem_nums(token_num * (head_num + 2 * gqa_group_size) * dim_head); + // for all q k v constexpr int PackSize = 16 / sizeof(T); const int pack_num = elem_nums / PackSize; const int blocksize = 128; diff --git a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu index a7cd7aebb92c7f..332cb1364ba46d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu @@ -264,8 +264,8 @@ void FusedSeqpoolCVMGradCUDAKernel( } } - int cur_batch_size = in_grad->lod().size() ? in_grad->lod()[0].size() - 1 - : in_grad->dims()[0]; + auto cur_batch_size = in_grad->lod().size() ? 
in_grad->lod()[0].size() - 1 + : in_grad->dims()[0]; if (batch_size == -1) { batch_size = cur_batch_size; } else { diff --git a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu index 65b96dc22d8357..345a2af55bd01f 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu @@ -298,7 +298,7 @@ void FusedSeqpoolCVMCUDAKernel(const Context &dev_ctx, lods.push_back(i + 1); } } - int cur_batch_size = + auto cur_batch_size = input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0]; if (batch_size == -1) { batch_size = cur_batch_size; diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu index 1a17ede68774c1..e31a11070fb4db 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu @@ -65,7 +65,7 @@ __global__ void SoftmaxMaskFuseGradGPUKernel(const T* grad_input, #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * WARP_SIZE; + auto data_index = kOneLoadingCounts * local_idx + ii * WARP_SIZE; if (data_index < batch_data) { load_data(temp_grad_input, grad_input + i * key_seq_len + ii * warp_size); @@ -103,7 +103,7 @@ __global__ void SoftmaxMaskFuseGradGPUKernel(const T* grad_input, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto data_index = kOneLoadingCounts * local_idx + ii * warp_size; if (data_index < key_seq_len) { // compute gradients T samples_out[kOneLoadingCounts]; @@ -149,7 +149,7 @@ void FusedSoftmaxMaskGradKernel(const Context& dev_ctx, // use 128 threads per block to maximum gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; int64_t blocks = batch_count / batches_per_block; dim3 threads(warp_size, warps_per_block, 1); diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu index dcedf010bad4b6..490fde17889cce 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu @@ -72,8 +72,9 @@ __global__ void SoftmaxMaskFuseV1GPUKernel(const T* x_data, // might be many batches per warp. 
compute the index within the batch int local_idx = threadIdx.x; - int x_offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; - int mask_offset = mask_fist_idx * key_seq_len + kOneLoadingCounts * local_idx; + auto x_offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; + auto mask_offset = + mask_fist_idx * key_seq_len + kOneLoadingCounts * local_idx; x_data += x_offset; mask_data += mask_offset; y_data += x_offset; @@ -89,10 +90,10 @@ __global__ void SoftmaxMaskFuseV1GPUKernel(const T* x_data, #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto data_index = kOneLoadingCounts * local_idx + ii * warp_size; if (data_index < batch_data) { - int itr_idx = i * key_seq_len + ii * warp_size; + auto itr_idx = i * key_seq_len + ii * warp_size; // efficiently load data from global memory load_data(temp_data, x_data + itr_idx); @@ -148,7 +149,7 @@ __global__ void SoftmaxMaskFuseV1GPUKernel(const T* x_data, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int idx = kOneLoadingCounts * local_idx + ii * warp_size; + auto idx = kOneLoadingCounts * local_idx + ii * warp_size; if (idx < key_seq_len) { #pragma unroll for (int counter = 0; counter < kOneLoadingCounts; ++counter) { @@ -234,10 +235,10 @@ __global__ void SoftmaxMaskFuseV2GPUKernel(const T* x_data, #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto data_index = kOneLoadingCounts * local_idx + ii * warp_size; if (data_index < batch_data) { - int itr_idx = i * key_seq_len + ii * warp_size; + auto itr_idx = i * key_seq_len + ii * warp_size; // efficiently load data from global memory load_data(temp_data, x_data + itr_idx); @@ -293,7 +294,7 @@ __global__ void SoftmaxMaskFuseV2GPUKernel(const T* x_data, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int idx = kOneLoadingCounts * local_idx + ii * warp_size; + auto idx = kOneLoadingCounts * local_idx + ii * warp_size; if (idx < key_seq_len) { #pragma unroll for (int counter = 0; counter < kOneLoadingCounts; ++counter) { @@ -538,7 +539,7 @@ void FusedSoftmaxMaskKernel(const Context& dev_ctx, // use 128 threads per block to maximum gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; PADDLE_ENFORCE_EQ( query_seq_len % batches_per_block, diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu index ddf59e49be0ad5..349ae52a714394 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu @@ -178,7 +178,7 @@ void FusedSoftmaxMaskFuseUpperTriangleGradKernel(const Context& dev_ctx, // use 128 threads per block to maximum gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; // if we use dim3 blocks(query_seq_len, // (attn_mul_batch + batches_per_block) / 
batches_per_block, diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu index 0a5b7ef202a2de..66babfe14d6e4d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu @@ -190,7 +190,7 @@ void FusedSoftmaxMaskFuseUpperTriangleKernel(const Context& dev_ctx, int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; PADDLE_ENFORCE_EQ( query_seq_len % batches_per_block, diff --git a/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu index e4b0f90a8ce542..0e32fca9ec53b0 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu @@ -213,7 +213,7 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin, threadIdx.x / 128; // 0 or 1, two quant blocks per block const int in_y_idx = blockIdx.y; const int in_x_idx = blockIdx.x * blockDim.x + x_offset; - const int src_idx = in_y_idx * cols + in_x_idx; + const auto src_idx(in_y_idx * cols + in_x_idx); // Load data and compute swiGLU activation if (in_x_idx < cols / 2) [[likely]] { // NOLINT diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu index acb3b83bc983f3..e1afd708ce079f 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -126,16 +126,17 @@ __global__ void masked_multihead_attention_kernel( // real batch id const int bbi = bi / params.beam_width; const int hi = blockIdx.y; - const int bhi = bi * params.num_head + hi; + const auto bhi(bi * params.num_head + hi); const int kv_num_head = params.kv_num_head; const int num_head_per_group = params.num_head / kv_num_head; // hi means the head index in query processed by this cuda thread. // kv_bhi means the merged batch and head index in key and value processed by // this cuda thread. - const int kv_bhi = bi * kv_num_head + hi / num_head_per_group; + const auto kv_bhi(bi * kv_num_head + hi / num_head_per_group); + + const auto bbhi(bbi * params.beam_width * params.num_head + hi); - const int bbhi = bbi * params.beam_width * params.num_head + hi; const int tid = threadIdx.x; const int bi_seq_len_offset = bi * params.max_seq_length; @@ -153,7 +154,7 @@ __global__ void masked_multihead_attention_kernel( int start_seq = 0; int end_seq = act_time_step; bool is_last_block = (SPLIT == false); - int real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; + auto real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; if constexpr (SPLIT) { if (split_index >= real_split_each_batch) return; @@ -168,7 +169,8 @@ __global__ void masked_multihead_attention_kernel( // qkv [B, S=1, num_head + 2 * kv_num_head, head_dim] // this hi means the head index in query! 
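A minimal standalone sketch (not part of the patch) of what `auto` buys for intermediate index expressions such as `qkv_base_offset` above: the deduced type simply follows the usual arithmetic conversions of the operands, so the offset widens together with its inputs instead of being narrowed back to `int`. The values below are illustrative stand-ins for the kernel parameters (`num_head`, `Dh`, `bi`, `hi`), and the `int64_t` head dim is a hypothetical widening, not the current layout; note also that expressions built from unsigned operands (e.g. a `.size() - 1`) deduce to an unsigned type, which is worth remembering where they later meet signed comparisons.

    #include <cstdint>
    #include <type_traits>

    int main() {
      int num_head = 32;   // stand-in for params.num_head
      int bi = 1, hi = 3;  // batch / head indices
      int64_t Dh = 128;    // suppose the head dim is held as int64_t one day

      // With `auto`, the offset type is deduced from the operands (int64_t
      // here), so nothing is silently truncated back to int.
      auto qkv_base_offset = bi * num_head * Dh + hi * Dh;
      static_assert(std::is_same<decltype(qkv_base_offset), int64_t>::value,
                    "offset follows the widest operand");
      (void)qkv_base_offset;
      return 0;
    }
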
- int qkv_base_offset = bi * (params.num_head + 2 * kv_num_head) * Dh + hi * Dh; + auto qkv_base_offset = + bi * (params.num_head + 2 * kv_num_head) * Dh + hi * Dh; // QK_VEC_SIZE is only used for compute q dot k . constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); @@ -198,9 +200,9 @@ __global__ void masked_multihead_attention_kernel( // k has QK_VECS_PER_WARP elements: [Qk_vec, Qk_vec, ..., Qk_vec] // per cuda thread read a Qk_vec of q and k and compute q dot k. if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - int k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; + auto qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + auto q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; + auto k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; Qk_vec q; zero(q); @@ -246,7 +248,7 @@ __global__ void masked_multihead_attention_kernel( if (!params.neox_rotary_style) { if (params.rotary_emb_dims != 0) { - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.batch_size * Dh; Qk_vec_RoPE cos_emb, sin_emb; @@ -267,16 +269,16 @@ __global__ void masked_multihead_attention_kernel( if (params.rotary_emb_dims != 0) { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.batch_size * Dh; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int qk_right_offset = qkv_base_offset + right_id * QK_VEC_SIZE; - int q_right_bias_offset = hi * Dh + right_id * QK_VEC_SIZE; - int k_right_bias_offset = + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto qk_right_offset = qkv_base_offset + right_id * QK_VEC_SIZE; + auto q_right_bias_offset = hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_bias_offset = hi / num_head_per_group * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); @@ -346,9 +348,9 @@ __global__ void masked_multihead_attention_kernel( if (is_last_block) { int co = tid / QK_VECS_IN_16B; int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = kv_bhi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - act_time_step * QK_ELTS_IN_16B + ci; + auto offset = kv_bhi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + act_time_step * QK_ELTS_IN_16B + ci; if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { *reinterpret_cast(¶ms.cache_kv[offset]) = k; } @@ -395,7 +397,7 @@ __global__ void masked_multihead_attention_kernel( constexpr int K_ELTS_PER_THREAD = Dh_MAX / THREADS_PER_KEY; constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - int ko = tid / THREADS_PER_KEY + start_seq; + auto ko = tid / THREADS_PER_KEY + start_seq; int ki = (tid % THREADS_PER_KEY) * K_VEC_SIZE; static_assert(Dh_MAX == THREADS_PER_KEY * K_VEC_SIZE * K_VECS_PER_THREAD, ""); @@ -412,7 +414,7 @@ __global__ void masked_multihead_attention_kernel( T *k_cache = ¶ms.cache_kv[kv_bhi * params.max_seq_length * Dh + ki]; T *k_cache_batch = ¶ms.cache_kv[bbhi * params.max_seq_length * Dh + ki]; - int ti_end = 
div_up(curr_seq_section, K_PER_WARP) * K_PER_WARP + start_seq; + auto ti_end = div_up(curr_seq_section, K_PER_WARP) * K_PER_WARP + start_seq; const int *beam_offsets = params.beam_cache_offset ? ¶ms.beam_cache_offset[bi_seq_len_offset] @@ -420,15 +422,16 @@ __global__ void masked_multihead_attention_kernel( #pragma unroll for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - const int beam_offset = beam_offsets ? beam_offsets[ti] * params.num_head * - params.max_seq_length * Dh - : 0; + const auto beam_offset(beam_offsets ? beam_offsets[ti] * params.num_head * + params.max_seq_length * Dh + : 0); + K_vec k[K_VECS_PER_THREAD]; K_vec k_vec_zero; zero(k_vec_zero); #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + ti; + auto jj = ii * params.max_seq_length + ti; if (ti < end_seq) { if (beam_offset) { k[ii] = @@ -487,7 +490,7 @@ __global__ void masked_multihead_attention_kernel( qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - int useful_smem_index = + auto useful_smem_index = is_last_block ? curr_seq_section : curr_seq_section - 1; float sum = 0.f; for (int ti = tid; ti <= useful_smem_index; ti += THREADS_PER_BLOCK) { @@ -527,7 +530,7 @@ __global__ void masked_multihead_attention_kernel( // vi means the head_dim index processed by this cuda thread in the value. // so this cuda thread compute [1, k] * [k, vi:vi+V_VEC_SIZE] and k starts // from vo and increases by a step V_PER_ITER. - int vo = tid / THREADS_PER_VALUE + start_seq; + auto vo = tid / THREADS_PER_VALUE + start_seq; int vi = (tid % THREADS_PER_VALUE) * V_VEC_SIZE; T *v_cache = ¶ms.cache_kv[params.cache_batch_size * kv_num_head * @@ -550,10 +553,10 @@ __global__ void masked_multihead_attention_kernel( if (Dh == Dh_MAX || vi < Dh) { #pragma unroll for (int ti = vo; ti < end_seq; ti += V_PER_ITER) { - const int beam_offset = - beam_offsets - ? beam_offsets[ti] * params.num_head * params.max_seq_length * Dh - : 0; + const auto beam_offset(beam_offsets ? beam_offsets[ti] * params.num_head * + params.max_seq_length * Dh + : 0); + V_vec v; if (beam_offset) { v = *reinterpret_cast( @@ -662,14 +665,15 @@ __global__ void post_process_kernel(Masked_multihead_attention_params params, int act_time_step = params.sequence_lengths == nullptr ? 
params.timestep : params.sequence_lengths[bi]; - int real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; + auto real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; if (real_split_each_batch <= 1) { return; } const int tid = threadIdx.x; const int hi = blockIdx.x; - const int bhi = (bi * params.num_head + hi); + const auto bhi((bi * params.num_head + hi)); + const int bhsi = (bi * params.num_head + hi) * params.split_seq; extern __shared__ float2 qk_sum_max_smem[]; @@ -1018,7 +1022,7 @@ void DispatchWithDtype(const Context &dev_ctx, int k_num_head = cache_kv.dims()[2]; int v_num_head = k_num_head; // this num_head means query's head - int num_head = + auto num_head = x.dims()[x.dims().size() - 1] / dim_head - k_num_head - v_num_head; Masked_multihead_attention_params params; diff --git a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu index 393128051b561a..ba8b6a169ac96f 100644 --- a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu @@ -105,9 +105,11 @@ __global__ void TransposeQkvKernel(const int H, const int NH = N * H; const int NHS = NH * S; - const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3; - const int bias_offset = m * NH + n * H; - const int out_offset = s * H + n * S * H + b * NHS + m * NHS * B; + const auto in_offset(n * H + m * NH + s * 3 * NH + b * NHS * 3); + + const auto bias_offset(m * NH + n * H); + + const auto out_offset(s * H + n * S * H + b * NHS + m * NHS * B); const int i = threadIdx.x; output[out_offset + i] = @@ -134,7 +136,7 @@ void TransQKVWithBias(const int batch, float *output, gpuStream_t stream) { // BxSx3xNxH + 3xNxH -> 3xBxNxSxH - int scratch_size = batch * head_num * seq_len * seq_len; + auto scratch_size = batch * head_num * seq_len * seq_len; const dim3 grid(seq_len, batch, 3); // scratch % 4 == 0 to ensure the alignment if (head_size % 4 == 0 && scratch_size % 4 == 0) { @@ -196,7 +198,7 @@ void TransQKVWithBias(const int batch, phi::float16 *output, gpuStream_t stream) { // BxSx3xNxH + 3xNxH -> 3xBxNxSxH - int scratch_size = batch * head_num * seq_len * seq_len; + auto scratch_size = batch * head_num * seq_len * seq_len; const dim3 grid(seq_len, batch, 3); if (head_size % 2 == 0 && scratch_size % 2 == 0) { const int h = head_size / 2; @@ -302,7 +304,7 @@ void MultiheadMatmulKernel(const Context &dev_ctx, temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); auto *temp_qk_bias = dev_ctx.template Alloc( &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); - int grid = batch * head_number * seq_len; + auto grid = batch * head_number * seq_len; int block = round_up(seq_len); broadcast<<>>( bias_qk_d, temp_qk_bias, seq_len, head_number); @@ -315,14 +317,14 @@ void MultiheadMatmulKernel(const Context &dev_ctx, temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); auto *temp_qk_bias = dev_ctx.template Alloc( &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); - int grid = batch * head_number * seq_len; + auto grid = batch * head_number * seq_len; int block = round_up(seq_len); broadcast_batch_head_number<<>>( bias_qk_d, temp_qk_bias, batch, seq_len, head_number); bias_qk_d = static_cast(temp_qk_bias); } if (!bias_qk) { - int size = batch * head_number * seq_len * seq_len; + auto size = batch * head_number * seq_len * seq_len; temp_bias_tensor.Resize({size}); auto *temp_qk_bias = dev_ctx.template Alloc( &temp_bias_tensor, 
temp_bias_tensor.numel() * sizeof(T)); @@ -362,7 +364,7 @@ void MultiheadMatmulKernel(const Context &dev_ctx, phi::DenseTensor multihead_temp_tensor; // B * head_number * S * S * 1 + B * S * 3 * N * H - int scratch_size = batch * head_number * seq_len * seq_len * 1; + auto scratch_size = batch * head_number * seq_len * seq_len * 1; multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()}); auto *multihead_temp_data = dev_ctx.template Alloc( &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T)); @@ -408,7 +410,7 @@ void MultiheadMatmulKernel(const Context &dev_ctx, T(0.0)); } - int grid = batch * head_number * seq_len; + auto grid = batch * head_number * seq_len; int block = head_size; transpose<<>>( tptr, output_d, batch, seq_len, head_number, head_size); diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu index b2d15a59f8b1c9..1700cadaf95ecc 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -92,14 +92,14 @@ __global__ void qkv_attention_kernel(QkvUnpackMhaParams params, // real batch id const int bbi = bi / params.beam_width; const int hi = blockIdx.x; - const int bhi = bi * params.num_head + hi; + const auto bhi(bi * params.num_head + hi); const int kv_num_head = params.kv_num_head; const int num_head_per_group = params.num_head / kv_num_head; - const int kv_bhi = bi * kv_num_head + hi / num_head_per_group; + const auto kv_bhi(bi * kv_num_head + hi / num_head_per_group); - const int bbhi = bbi * params.beam_width * params.num_head + hi; + const auto bbhi(bbi * params.beam_width * params.num_head + hi); const int tid = threadIdx.x; @@ -108,7 +108,7 @@ __global__ void qkv_attention_kernel(QkvUnpackMhaParams params, int act_time_step = params.timestep; - int qkv_base_offset = bi * (params.num_head) * Dh + hi * Dh; + auto qkv_base_offset = bi * (params.num_head) * Dh + hi * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -120,9 +120,9 @@ __global__ void qkv_attention_kernel(QkvUnpackMhaParams params, // load q element to q smem if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - int k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; + auto qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + auto q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; + auto k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; Qk_vec q; zero(q); diff --git a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc index d9dee204f7fc38..ca50fe63fc33aa 100644 --- a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc @@ -73,11 +73,11 @@ class GRUOneDNNHandler // Is it int8 kernel const bool is_INT8 = std::is_same::value; if (is_INT8) { - const int weights_scale_mask = + const auto weights_scale_mask( 0 + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` - + - (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + (1 << 4)); + // bit, indicating the unique scales for `o` dim in `ldigo` attr_.set_rnn_data_qparams(scale_data, shift_data); attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); diff --git a/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h b/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h index 
95e8900cc439c6..71ec1e859ee46a 100644 --- a/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h +++ b/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h @@ -65,11 +65,11 @@ class RNNONEDNNHandler : public phi::funcs::OneDNNHandlerT { if (is_INT8) { // Int8 attributes - const int weights_scale_mask = + const auto weights_scale_mask( 0 + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` - + - (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + (1 << 4)); + // bit, indicating the unique scales for `o` dim in `ldigo` attr_.set_rnn_data_qparams(scale_data, shift_data); attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); diff --git a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc old mode 100755 new mode 100644 index 1a21b7a1b3e562..3bd0a9ebc95a78 --- a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc @@ -69,7 +69,7 @@ void qkv_split_rope_kernel( auto k_data = reinterpret_cast(k_out->data()); auto v_data = reinterpret_cast(v_out->data()); auto qkv_input_data = reinterpret_cast(qkv_input.data()); - int qkv_head = q_num_head + 2 * kv_num_head; + auto qkv_head = q_num_head + 2 * kv_num_head; int32_t ret; ret = baidu::xpu::api::split(xpu_ctx.x_context(), qkv_input_data, @@ -195,7 +195,8 @@ void BlockMultiheadAttentionXPUKernel( const int kv_num_head = key_cache_dims[1]; const int dim_head = key_cache_dims[3]; const int total_num_head = qkv.dims()[qkv.dims().size() - 1] / dim_head; - const int q_num_head = total_num_head - 2 * kv_num_head; + const auto q_num_head(total_num_head - 2 * kv_num_head); + const int bsz = cum_offsets.dims()[0]; const int max_block_per_seq = block_tables.dims()[1]; const int out_row = fmha_out->dims()[0]; diff --git a/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc index cfbdffb3473f31..650153663d06ac 100644 --- a/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc @@ -28,7 +28,7 @@ void FillSeqLod(int batch_size, for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { int cur_batch_seq_len = 0; for (int seq_idx = 0; seq_idx < max_seq_len; seq_idx++) { - int mask_idx = batch_idx * max_seq_len + seq_idx; + auto mask_idx = batch_idx * max_seq_len + seq_idx; if (mask[mask_idx] > 0) { cur_batch_seq_len++; } else { @@ -47,7 +47,7 @@ void FillSeqLod(int batch_size, for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { int cur_batch_seq_len = 0; for (int seq_idx = 0; seq_idx < max_seq_len; seq_idx++) { - int mask_idx = batch_idx * max_seq_len + seq_idx; + auto mask_idx = batch_idx * max_seq_len + seq_idx; if (mask[mask_idx] > 1e-7) { cur_batch_seq_len++; } else { diff --git a/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu index 6fdcebde8e6d94..b83b0398380af0 100644 --- a/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu @@ -65,9 +65,10 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel(const T* dy, T ds_sum = 0; T db_sum = 0; for (int64_t j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const auto index(layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i); + ds_sum += dy[index] * x[index]; db_sum += dy[index]; } diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 4dbbcb814cee21..74a29561a543b6 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -74,8 +74,8 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx, // reduce_dims = [3] // reduce along the broadcasted axis std::vector reduce_dims_vec; for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; + auto out_axis = out_rank - j - 1; + auto in_axis = in_rank - j - 1; if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { reduce_dims_vec.push_back(in_axis); diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu index cc8fdbb57ff5a7..9a9a0be0917338 100644 --- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu @@ -416,7 +416,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, size_t cub_temp_storage_bytes = std::max(std::max(cub_sort_temp_store_size, cub_scan_temp_store_size), cub_sum_temp_store_size); - int num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; + auto num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; PADDLE_ENFORCE_GT( (4 * num_buffer_ele + 3 * (nranks + 1) + num_temp_ele), 0, diff --git a/paddle/phi/kernels/gpu/correlation_grad_kernel.cu b/paddle/phi/kernels/gpu/correlation_grad_kernel.cu index 2a1f277d8e77f9..c4eea6654cccb7 100644 --- a/paddle/phi/kernels/gpu/correlation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/correlation_grad_kernel.cu @@ -47,7 +47,7 @@ __global__ void correlation_backward_input1(int64_t n, int kernel_rad = (kernel_size - 1) / 2; int displacement_rad = max_displacement / stride2; - int displacement_size = 2 * displacement_rad + 1; + auto displacement_size = 2 * displacement_rad + 1; int64_t xmin = (w - kernel_rad - max_displacement) / stride1; int64_t ymin = (h - kernel_rad - max_displacement) / stride1; @@ -128,7 +128,7 @@ __global__ void correlation_backward_input2(int64_t n, int kernel_rad = (kernel_size - 1) / 2; int displacement_rad = max_displacement / stride2; - int displacement_size = 2 * displacement_rad + 1; + auto displacement_size = 2 * displacement_rad + 1; int64_t p_input_width = input_width + 2 * pad_size; int64_t p_input_height = input_height + 2 * pad_size; @@ -208,8 +208,8 @@ void CorrelationCUDAGradKernel(const Context &dev_ctx, int H = in_dims[2]; int W = in_dims[3]; - int padded_input_height = H + 2 * pad_size; - int padded_input_width = W + 2 * pad_size; + auto padded_input_height = H + 2 * pad_size; + auto padded_input_width = W + 2 * pad_size; phi::DenseTensor rinput1; rinput1.Resize({N, padded_input_height, padded_input_width, C}); diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu index aa0aeb62683164..93500e60d8f72d 100644 --- a/paddle/phi/kernels/gpu/correlation_kernel.cu +++ b/paddle/phi/kernels/gpu/correlation_kernel.cu @@ -41,7 +41,7 @@ __global__ void correlation_forward(T *output, int kernel_rad = (kernel_size - 1) / 2; int displacement_rad = max_displacement / stride2; - int displacement_size = 2 * displacement_rad + 1; + auto 
displacement_size = 2 * displacement_rad + 1; int64_t global_block_id = blockIdx.x; int64_t hw = (int64_t)OH * OW; @@ -130,8 +130,8 @@ void CorrelationCUDAKernel(const Context &dev_ctx, int H = in_dims[2]; int W = in_dims[3]; - int padded_input_height = H + 2 * pad_size; - int padded_input_width = W + 2 * pad_size; + auto padded_input_height = H + 2 * pad_size; + auto padded_input_width = W + 2 * pad_size; phi::DenseTensor rinput1; rinput1.Resize({N, padded_input_height, padded_input_width, C}); diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index af56951ebcf48a..a96e1afd4f4dae 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -47,7 +47,7 @@ __global__ void SoftLabelCrossEntropyGradientKernel(T* logit_grad, if (ids < static_cast(n) * d) { int idx_n = ids / d; int idx_remain = ids % remain; - int idx_loss = idx_n * remain + idx_remain; + auto idx_loss = idx_n * remain + idx_remain; logit_grad[ids] = loss_grad[idx_loss] * (-labels[ids] / logit_grad[ids]); } } @@ -63,7 +63,7 @@ __global__ void HardLabelCrossEntropyGradientKernel(T* logit_grad, int idx_n = index / remain; int idx_remain = index % remain; int tmp = static_cast(labels[index]); - int idx = idx_n * d + tmp * remain + idx_remain; + auto idx = idx_n * d + tmp * remain + idx_remain; if (ignore_index != tmp) { logit_grad[idx] = -static_cast(1.) / logit_grad[idx]; } @@ -81,7 +81,7 @@ __global__ void ScaleCrossEntropyGradient(T* logit_grad, CUDA_KERNEL_LOOP(index, num) { int idx_n = index / d; int idx_remain = index % remain; - int idx_lbl = idx_n * remain + idx_remain; + auto idx_lbl = idx_n * remain + idx_remain; int k = (index % d) / remain; auto lbl = static_cast(labels[idx_lbl]); if (lbl == ignore_index || lbl != k) { diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index be2c296a2ff046..7b43d631f57af5 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -755,7 +755,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, // use 128 threads per block to maximimize gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); + auto warps_per_block = (threads_per_block / kWarpSize); int batches_per_block = warps_per_block * batches_per_warp; int64_t blocks = (static_cast(N) + batches_per_block - 1) / batches_per_block; @@ -1099,7 +1099,7 @@ void SwitchWarpSoftmaxForward(T* loss, int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; int batches_per_warp = (kDimCeil <= 128) ? 
2 : 1; constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); + auto warps_per_block = (threads_per_block / kWarpSize); int batches_per_block = warps_per_block * batches_per_warp; int64_t blocks = (static_cast(batch_size) + batches_per_block - 1) / batches_per_block; diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 2edac5eba5d9ef..97c55e3fc436e5 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -228,12 +228,15 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( return; int tmp_1 = idx / output_width; - const int w_out = idx - tmp_1 * output_width; + const auto w_out(idx - tmp_1 * output_width); + int tmp_2 = tmp_1 / output_height; - const int h_out = tmp_1 - tmp_2 * output_height; + const auto h_out(tmp_1 - tmp_2 * output_height); + tmp_1 = tmp_2; tmp_2 = tmp_1 / output_channels; - const int c_out = tmp_1 - tmp_2 * output_channels; + const auto c_out(tmp_1 - tmp_2 * output_channels); + const int batch = tmp_2; const int c_in = c_out / filter_multiplier; @@ -241,9 +244,9 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( int in_offset = ((batch * input_channels + c_in) * input_height) * input_width; - int weight_offset = c_out * filter_height * filter_width; - int h_in_start = -padding_height + h_out * stride_height; - int w_in_start = -padding_width + w_out * stride_width; + auto weight_offset = c_out * filter_height * filter_width; + auto h_in_start = -padding_height + h_out * stride_height; + auto w_in_start = -padding_width + w_out * stride_width; #pragma unroll for (int fh = 0, h_in = h_in_start; fh < fh_size; @@ -252,7 +255,7 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( for (int fw = 0, w_in = w_in_start; fw < fw_size; fw++, w_in += dilate_width) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = in_offset + h_in * input_width + w_in; + auto offset = in_offset + h_in * input_width + w_in; T in_data = input_data[offset]; if (fuse_relu_before_conv) { value += filter_data[weight_offset] * @@ -280,20 +283,26 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( } int tmp_1 = idx / output_channels; - const int c_out = idx - tmp_1 * output_channels; + const auto c_out(idx - tmp_1 * output_channels); + int tmp_2 = tmp_1 / output_width; - const int w_out = tmp_1 - tmp_2 * output_width; + const auto w_out(tmp_1 - tmp_2 * output_width); + tmp_1 = tmp_2; tmp_2 = tmp_1 / output_height; - const int h_out = tmp_1 - tmp_2 * output_height; + const auto h_out(tmp_1 - tmp_2 * output_height); + const int batch = tmp_2; const int c_in = c_out / filter_multiplier; T value(0); - const int in_offset = - batch * input_height * input_width * input_channels + c_in; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; + const auto in_offset(batch * input_height * input_width * input_channels + + c_in); + + const auto h_in_start(-padding_height + h_out * stride_height); + + const auto w_in_start(-padding_width + w_out * stride_width); + int weight_offset = 0; #pragma unroll @@ -303,7 +312,7 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( for (int fw = 0, w_in = w_in_start; fw < fw_size; ++fw, w_in += dilate_width) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = in_offset + (h_in * input_width + w_in) * input_channels; + auto offset = in_offset + (h_in * input_width + w_in) * 
input_channels; T in_data = input_data[offset]; const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { @@ -336,8 +345,9 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( const int c_in = c_out / filter_multiplier; T value(0); - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; + const auto h_in_start(-padding_height + h_out * stride_height); + + const auto w_in_start(-padding_width + w_out * stride_width); int in_offset = ((batch * input_channels + c_in) * input_height) * input_width; @@ -348,7 +358,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( w_in += dilate_width, w_f++) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = in_offset + h_in * input_width + w_in; + auto offset = in_offset + h_in * input_width + w_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * static_cast( @@ -359,7 +369,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( } } } - int index = + auto index = ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + w_out; output_data[index] = value; @@ -375,10 +385,11 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( if (h_out >= output_height) { return; } - int in_offset = batch * input_height * input_width * input_channels; - int out_offset = + auto in_offset = batch * input_height * input_width * input_channels; + auto out_offset = (batch * output_height + h_out) * output_width * output_channels; - const int h_in_start = -padding_height + h_out * stride_height; + const auto h_in_start(-padding_height + h_out * stride_height); + const int wi_size = (output_width + dilate_width - 1) / dilate_width; const int kWeightSize = c_filter * c_filter; T r_weight[kWeightSize]; @@ -391,20 +402,21 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( const int c_in = c_out / filter_multiplier; for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { int i_dw = i / wi_size; - int i_wi = i - i_dw * wi_size; - int w_out = i_wi * dilate_width + i_dw; + auto i_wi = i - i_dw * wi_size; + auto w_out = i_wi * dilate_width + i_dw; if (w_out >= output_width) { continue; } T value(0); - const int w_in_start = -padding_width + w_out * stride_width; + const auto w_in_start(-padding_width + w_out * stride_width); + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; h_in += dilate_height, h_f++) { for (int w_in = w_in_start, w_f = 0; w_f < c_filter; w_in += dilate_width, w_f++) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = + auto offset = in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * @@ -416,7 +428,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( } } } - int index = out_offset + w_out * output_channels + c_out; + auto index = out_offset + w_out * output_channels + c_out; output_data[index] = value; } } @@ -558,25 +570,28 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNCHW( } int tmp_1 = idx / input_width; - const int w_in = idx - tmp_1 * input_width; + const auto w_in(idx - tmp_1 * input_width); + int tmp_2 = tmp_1 / input_height; - const int h_in = tmp_1 - tmp_2 * input_height; + const auto h_in(tmp_1 - tmp_2 * input_height); + tmp_1 = tmp_2; tmp_2 = tmp_1 / input_channels; - const int c_in = tmp_1 - tmp_2 * input_channels; + const auto c_in(tmp_1 - tmp_2 * 
input_channels); + const int batch = tmp_2; T value(0); for (int c_mul = 0; c_mul < filter_multiplier; ++c_mul) { - int c_out = c_in * filter_multiplier + c_mul; - int filter_offset = c_out * filter_height * filter_width; + auto c_out = c_in * filter_multiplier + c_mul; + auto filter_offset = c_out * filter_height * filter_width; #pragma unroll for (int fh = 0; fh < fh_size; ++fh) { #pragma unroll for (int fw = 0; fw < fw_size; ++fw) { - int h_out = h_in + padding_height - fh * dilate_height; - int w_out = w_in + padding_width - fw * dilate_width; + auto h_out = h_in + padding_height - fh * dilate_height; + auto w_out = w_in + padding_width - fw * dilate_width; if ((h_out - h_out / stride_height * stride_height == 0) && (w_out - w_out / stride_width * stride_width == 0)) { h_out /= stride_height; @@ -584,7 +599,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNCHW( if (h_out >= 0 && h_out < output_height && w_out >= 0 && w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_channels + c_out) * output_height + h_out) * output_width + w_out; @@ -610,15 +625,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { for (int w_in = threadIdx.y; w_in < input_width; w_in += blockDim.y) { - int h_out_start = + auto h_out_start = h_in - (filter_height - 1) * dilate_height + padding_height; - int w_out_start = + auto w_out_start = w_in - (filter_width - 1) * dilate_width + padding_width; T value(0); - int index = ((batch * input_height + h_in) * input_width + w_in) * - input_channels + - c_in; + auto index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; if (fuse_relu_before_conv) { if (input_data[index] <= T(0)) { input_grad_data[index] = 0; @@ -627,7 +642,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( } for (int c_i = 0; c_i < filter_multiplier; c_i++) { - int c_out = c_in * filter_multiplier + c_i; + auto c_out = c_in * filter_multiplier + c_i; int weight_offset = filter_height * filter_width; for (int h_out = h_out_start, h_f = 0; h_f < filter_height; h_out += dilate_height, h_f++) { @@ -639,11 +654,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_height + s_h_out) * output_width + s_w_out) * output_channels + c_out; - int filter_offset = weight_offset * output_channels + c_out; + auto filter_offset = weight_offset * output_channels + c_out; value += output_grad_data[output_grad_offset] * filter_data[filter_offset]; } @@ -661,13 +676,14 @@ template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + const auto kWeightSize(c_filter * c_filter * c_filter_multiplier + 1); + T r_weight[kWeightSize]; const int batch = blockIdx.y; const int c_in = blockIdx.x; for (int c_i = 0; c_i < filter_multiplier; c_i++) { - int c_out = c_in * filter_multiplier + c_i; + auto c_out = c_in * filter_multiplier + c_i; const T* weight = filter_data + c_out * c_filter * c_filter; for (int i = 0; i < c_filter * c_filter; i++) r_weight[i + c_i * c_filter * c_filter] = @@ -676,11 +692,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( for (int w_in 
= threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; - int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + auto h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; + auto w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value(0); - int index = + auto index = ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + w_in; if (fuse_relu_before_conv) { @@ -691,7 +707,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( } for (int c_i = 0; c_i < filter_multiplier; c_i++) { - int c_out = c_in * filter_multiplier + c_i; + auto c_out = c_in * filter_multiplier + c_i; for (int h_out = h_out_start, h_f = 0; h_f < c_filter; h_out += dilate_height, h_f++) { for (int w_out = w_out_start, w_f = 0; w_f < c_filter; @@ -701,7 +717,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_channels + c_out) * output_height + s_h_out) * output_width + @@ -728,16 +744,17 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( if (h_in >= input_height) { return; } - const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + const auto kWeightSize(c_filter * c_filter * c_filter_multiplier + 1); + T r_weight[kWeightSize]; const int batch = blockIdx.z; const int wi_size = (input_width + dilate_width - 1) / dilate_width; - const int h_out_start = - h_in - (c_filter - 1) * dilate_height + padding_height; + const auto h_out_start(h_in - (c_filter - 1) * dilate_height + + padding_height); for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { - int c_out = c_in * c_filter_multiplier + c_i; + auto c_out = c_in * c_filter_multiplier + c_i; for (int i = 0; i < c_filter * c_filter; i++) r_weight[i + c_i * c_filter * c_filter] = filter_data[(c_filter * c_filter - i - 1) * output_channels + @@ -745,17 +762,17 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { int i_dw = i / wi_size; - int i_wi = i - i_dw * wi_size; - int w_in = i_wi * dilate_width + i_dw; + auto i_wi = i - i_dw * wi_size; + auto w_in = i_wi * dilate_width + i_dw; if (w_in >= input_width) { continue; } - int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + auto w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value(0); - int index = ((batch * input_height + h_in) * input_width + w_in) * - input_channels + - c_in; + auto index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; if (fuse_relu_before_conv) { if (input_data[index] <= T(0)) { input_grad_data[index] = 0; @@ -764,7 +781,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { - int c_out = c_in * c_filter_multiplier + c_i; + auto c_out = c_in * c_filter_multiplier + c_i; for (int h_out = h_out_start, h_f = 0; h_f < c_filter; h_out += dilate_height, h_f++) { for (int w_out = w_out_start, w_f = 0; w_f < c_filter; @@ -774,7 +791,7 @@ __device__ __inline__ void 
KernelDepthwiseConvInputGradCFilterNHWC( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_height + s_h_out) * output_width + s_w_out) * output_channels + c_out; @@ -941,22 +958,23 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( const int ohw = output_height * output_width; const int onhw = num * ohw; - const int h_offset = kh_id * dilate_height - padding_height; - const int w_offset = kw_id * dilate_width - padding_width; + const auto h_offset(kh_id * dilate_height - padding_height); + + const auto w_offset(kw_id * dilate_width - padding_width); if (loop_batch) { for (int og_w = threadIdx.x; og_w < output_width; og_w += blockDim.x) { for (int bid = 0; bid < num; ++bid) { for (int og_h = threadIdx.y; og_h < output_height; og_h += blockDim.y) { - int i_h = og_h * stride_height + h_offset; - int i_w = og_w * stride_width + w_offset; + auto i_h = og_h * stride_height + h_offset; + auto i_w = og_w * stride_width + w_offset; if (i_w >= 0 && i_w < input_width && i_h >= 0 && i_h < input_height) { - int input_offset = + auto input_offset = ((bid * input_channels + ic_id) * input_height + i_h) * input_width + i_w; - int output_grad_offset = + auto output_grad_offset = ((bid * output_channels + oc_id) * output_height + og_h) * output_width + og_w; @@ -976,19 +994,19 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( } else { for (int id = threadIdx.x; id < onhw; id += blockDim.x) { int bid = id / ohw; - int og_hw = id - bid * ohw; + auto og_hw = id - bid * ohw; int og_h = og_hw / output_width; - int og_w = og_hw - og_h * output_width; + auto og_w = og_hw - og_h * output_width; - int i_h = og_h * stride_height + h_offset; - int i_w = og_w * stride_width + w_offset; + auto i_h = og_h * stride_height + h_offset; + auto i_w = og_w * stride_width + w_offset; if (i_w >= 0 && i_w < input_width && i_h >= 0 && i_h < input_height) { - int input_offset = + auto input_offset = ((bid * input_channels + ic_id) * input_height + i_h) * input_width + i_w; - int output_grad_offset = (bid * output_channels + oc_id) * ohw + og_hw; + auto output_grad_offset = (bid * output_channels + oc_id) * ohw + og_hw; if (fuse_relu_before_conv) { f_grad += output_grad_data[output_grad_offset] * static_cast(max( @@ -1114,22 +1132,22 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( for (int kernel_id = threadIdx.x; kernel_id < output_channels; kernel_id += blockDim.x) { T s(0); - int gbid = + auto gbid = ((kernel_id * filter_height) + kernel_ih) * filter_width + kernel_iw; for (int image_w = threadIdx.y; image_w < output_width; image_w += blockDim.y) { - int kernel_h = kernel_ih * dilate_height - padding_height; - int kernel_w = kernel_iw * dilate_width - padding_width; + auto kernel_h = kernel_ih * dilate_height - padding_height; + auto kernel_w = kernel_iw * dilate_width - padding_width; - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; + auto image_hk = image_h * stride_height + kernel_h; + auto image_wk = image_w * stride_width + kernel_w; if (image_hk < 0 || image_hk >= input_height) continue; if (image_wk < 0 || image_wk >= input_width) continue; - int input_id = + auto input_id = ((bid * input_height + image_hk) * input_width + image_wk) * input_channels + kernel_id / filter_multiplier; - int output_id = + auto output_id = ((bid * output_height + image_h) * 
output_width + image_w) * output_channels + kernel_id; @@ -1141,7 +1159,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( s += output_grad_data[output_id] * input_data[input_id]; } } - const int numel = output_channels * filter_width * filter_height; + const auto numel(output_channels * filter_width * filter_height); + NoReturnAtomicAdd(filter_grad_data, gbid, numel, s); } } @@ -1183,24 +1202,24 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { int i_dw = i / wi_size; - int i_wi = i - i_dw * wi_size; - int image_w = i_wi * dilate_width + i_dw; + auto i_wi = i - i_dw * wi_size; + auto image_w = i_wi * dilate_width + i_dw; if (image_w >= output_width) { continue; } for (int kernel_ih = 0; kernel_ih < c_filter; ++kernel_ih) { for (int kernel_iw = 0; kernel_iw < c_filter; ++kernel_iw) { - int kernel_h = kernel_ih * dilate_height - padding_height; - int kernel_w = kernel_iw * dilate_width - padding_width; - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; + auto kernel_h = kernel_ih * dilate_height - padding_height; + auto kernel_w = kernel_iw * dilate_width - padding_width; + auto image_hk = image_h * stride_height + kernel_h; + auto image_wk = image_w * stride_width + kernel_w; if (image_hk < 0 || image_hk >= input_height) continue; if (image_wk < 0 || image_wk >= input_width) continue; - int input_id = + auto input_id = ((bid * input_height + image_hk) * input_width + image_wk) * input_channels + kernel_id / filter_multiplier; - int output_id = + auto output_id = ((bid * output_height + image_h) * output_width + image_w) * output_channels + kernel_id; @@ -1249,32 +1268,35 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterSmallChannelNHWC( T* filter_grad_data) { const int bid = blockIdx.y; const int idx = blockIdx.x * blockDim.x + threadIdx.x; - const int numel = output_channels * c_filter * c_filter; + const auto numel(output_channels * c_filter * c_filter); + if (idx >= numel) { return; } const int tmp = idx / output_channels; - const int kernel_id = idx - tmp * output_channels; + const auto kernel_id(idx - tmp * output_channels); + const int kernel_ih = tmp / c_filter; - const int kernel_iw = tmp - kernel_ih * c_filter; + const auto kernel_iw(tmp - kernel_ih * c_filter); - const int h_offset = kernel_ih * dilate_height - padding_height; - const int w_offset = kernel_iw * dilate_width - padding_width; + const auto h_offset(kernel_ih * dilate_height - padding_height); + + const auto w_offset(kernel_iw * dilate_width - padding_width); T s(0); for (int og_h = 0; og_h < output_height; ++og_h) { for (int og_w = 0; og_w < output_width; ++og_w) { - int image_hk = og_h * stride_height + h_offset; - int image_wk = og_w * stride_width + w_offset; + auto image_hk = og_h * stride_height + h_offset; + auto image_wk = og_w * stride_width + w_offset; if (image_hk >= 0 && image_hk < input_height && image_wk >= 0 && image_wk < input_width) { - int input_id = + auto input_id = ((bid * input_height + image_hk) * input_width + image_wk) * input_channels + kernel_id / filter_multiplier; - int output_id = ((bid * output_height + og_h) * output_width + og_w) * - output_channels + - kernel_id; + auto output_id = ((bid * output_height + og_h) * output_width + og_w) * + output_channels + + kernel_id; if (fuse_relu_before_conv) { s += output_grad_data[output_id] * static_cast( @@ -1852,7 +1874,8 @@ class DepthwiseConvFilterGradFunctor 
multi_fpn_rois, std::vector multi_level_rois_num, DenseTensor* restore_index) { - int num_level = max_level - min_level + 1; + auto num_level = max_level - min_level + 1; // check that the fpn_rois is not empty if (!rois_num.get_ptr()) { diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index 2e2f3dd127e9e4..e75f7203127c30 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -53,7 +53,7 @@ __global__ void Levenshtein(T* dist, const int start) { int idx = blockDim.x * blockIdx.x + threadIdx.x; int offset = N; - int index = start + idx * offset; + auto index = start + idx * offset; int row = index / (N + 1); int col = index % (N + 1); if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { @@ -171,12 +171,12 @@ void EditDistanceKernel(const Context& dev_ctx, // Compute the elements of distance matrix in the anti-diagonal direction for (int64_t slice = 2; slice < m + n + 1; ++slice) { - int z_m = slice < m + 1 ? 0 : slice - m; - int z_n = slice < n + 1 ? 0 : slice - n; - int size = slice - (z_m + z_n) + 1; // number of elements in the same - // anti-diagonal line to update + auto z_m = slice < m + 1 ? 0 : slice - m; + auto z_n = slice < n + 1 ? 0 : slice - n; + auto size = slice - (z_m + z_n) + 1; // number of elements in the same + // anti-diagonal line to update // the start index at which computes from - int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + auto start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index f3a2874b9234f5..d486c1359dd8bf 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -115,7 +115,7 @@ void GetGradXOrYOut(const GPUContext &dev_ctx, template static __global__ void SimpleElemwiseAddGradCUDAKernel( const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { - int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + auto tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; int stride = GRID_NUM_X * BLOCK_NUM_X; int loop = size / vec_size; int remainder = size % vec_size; diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu index 7f3a10a13efea9..d248ac8e533c09 100644 --- a/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu @@ -212,9 +212,10 @@ void FlashAttnV3BaseKernel( } auto const sizes = q.dims(); - const int batch_size = !is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1; + const auto batch_size(!is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1); + int seqlen_q = !is_varlen_q ? sizes[1] : max_seqlen_q_; - int total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; + auto total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; int num_heads = q.dims()[q.dims().size() - 2]; int const head_size = q.dims()[q.dims().size() - 1]; int const head_size_v = v.dims()[v.dims().size() - 1]; @@ -522,7 +523,7 @@ void FlashAttnV3BaseKernel( // We don't need max_seqlen_k_new, so seqlen_k_new can be whatever when // is_varlen_k_new int seqlen_k_new = !is_varlen_k_new ? k_new.dims()[1] : 0; - int total_k_new = + auto total_k_new = !is_varlen_k_new ? 
batch_size * k_new.dims()[1] : k_new.dims()[0]; if (!is_varlen_k_new) { CHECK_SHAPE(k_new, batch_size, seqlen_k_new, num_heads_k, head_size); @@ -601,8 +602,8 @@ void FlashAttnV3BaseKernel( : ((params_is_causal && !is_varlen) || (is_varlen && params_num_splits > 1)); if (scheduler_needs_semaphore || use_dynamic_split) { - int metadata_size = static_cast(scheduler_needs_semaphore) + - static_cast(use_dynamic_split) * params_b; + auto metadata_size = static_cast(scheduler_needs_semaphore) + + static_cast(use_dynamic_split) * params_b; phi::dynload::fa3_fwd_params_set_skip_scheduler_metadata_computation( params_handle, scheduler_metadata_.is_initialized()); if (scheduler_metadata_.is_initialized()) { @@ -1372,9 +1373,10 @@ void FlashMaskV2BaseKernel( } auto const sizes = q.dims(); - const int batch_size = !is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1; + const auto batch_size(!is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1); + int seqlen_q = !is_varlen_q ? sizes[1] : max_seqlen_q_; - int total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; + auto total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; int num_heads = q.dims()[q.dims().size() - 2]; int const head_size = q.dims()[q.dims().size() - 1]; int const head_size_v = v.dims()[v.dims().size() - 1]; @@ -1684,7 +1686,7 @@ void FlashMaskV2BaseKernel( // We don't need max_seqlen_k_new, so seqlen_k_new can be whatever when // is_varlen_k_new int seqlen_k_new = !is_varlen_k_new ? k_new.dims()[1] : 0; - int total_k_new = + auto total_k_new = !is_varlen_k_new ? batch_size * k_new.dims()[1] : k_new.dims()[0]; if (!is_varlen_k_new) { CHECK_SHAPE(k_new, batch_size, seqlen_k_new, num_heads_k, head_size); @@ -1768,8 +1770,8 @@ void FlashMaskV2BaseKernel( : ((params_is_causal && !is_varlen) || (is_varlen && params_num_splits > 1)); if (scheduler_needs_semaphore || use_dynamic_split) { - int metadata_size = static_cast(scheduler_needs_semaphore) + - static_cast(use_dynamic_split) * params_b; + auto metadata_size = static_cast(scheduler_needs_semaphore) + + static_cast(use_dynamic_split) * params_b; phi::dynload:: flashmaskv2_fwd_params_set_skip_scheduler_metadata_computation( params_handle, scheduler_metadata_.is_initialized()); diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu b/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu index 346e329f7d9d4d..3c5cd3f4ecde72 100644 --- a/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu +++ b/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu @@ -220,7 +220,7 @@ void set_params_fprop(Flash_fwd_params *params_handle, dynload::fa3_fwd_params_set_window_size_right(params_handle, window_size_right); - int arch = dprops.major * 10 + dprops.minor; + auto arch = dprops.major * 10 + dprops.minor; int num_sm = dprops.multiProcessorCount - sm_margin; dynload::fa3_fwd_params_set_arch(params_handle, arch); @@ -498,7 +498,7 @@ void set_flashmaskv2_params_fprop(Flash_fwd_params *params_handle, dynload::flashmaskv2_fwd_params_set_window_size_right(params_handle, window_size_right); - int arch = dprops.major * 10 + dprops.minor; + auto arch = dprops.major * 10 + dprops.minor; int num_sm = dprops.multiProcessorCount - sm_margin; dynload::flashmaskv2_fwd_params_set_arch(params_handle, arch); diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index d96cde7884de70..abe41249d3c3b6 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -210,7 +210,7 @@ static __global__ void 
FilterBBoxes(const T *bboxes, } __syncthreads(); if (threadIdx.x == 0) { - int size = (num - i) < BlockSize ? num - i : BlockSize; + auto size = (num - i) < BlockSize ? num - i : BlockSize; for (int j = 0; j < size; ++j) { if (keep_index[j] > -1) { keep[cnt++] = keep_index[j]; diff --git a/paddle/phi/kernels/gpu/global_gather_kernel.cu b/paddle/phi/kernels/gpu/global_gather_kernel.cu index c2efdc5af22204..1825502b3b6bc7 100644 --- a/paddle/phi/kernels/gpu/global_gather_kernel.cu +++ b/paddle/phi/kernels/gpu/global_gather_kernel.cu @@ -114,7 +114,7 @@ struct GlobalGatherFunctor { for (auto i = 0; i < n_expert; ++i) { comm_ctx->GroupStart(); for (auto j = 0; j < nranks; ++j) { - int idx = i + j * n_expert; + auto idx = i + j * n_expert; if (cpu_global_count_data[idx]) { auto send_buf = distributed::GetPartialTensor( *x, send_ptr * in_feat, cpu_global_count_data[idx] * in_feat); diff --git a/paddle/phi/kernels/gpu/global_scatter_kernel.cu b/paddle/phi/kernels/gpu/global_scatter_kernel.cu index 752b2aacf7e882..ff3e1817d9d4e8 100644 --- a/paddle/phi/kernels/gpu/global_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/global_scatter_kernel.cu @@ -114,7 +114,7 @@ struct GlobalScatterFunctor { for (auto i = 0; i < n_expert; ++i) { comm_ctx->GroupStart(); for (auto j = 0; j < nranks; ++j) { - int idx = i + j * n_expert; + auto idx = i + j * n_expert; if (cpu_local_count_data[idx]) { auto send_buf = distributed::GetPartialTensor( *x, diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index 18eefe68f2033e..d47d9ea71f3aed 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -388,7 +388,7 @@ void GroupNormGradKernel(const Context& dev_ctx, groups, std::min(max_grid_z, x_dims[0])); dim3 threads(block_size, 1, 1); - int flags = + auto flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; if (data_layout == DataLayout::kNCHW) { const int max_num_threads = 1024; @@ -486,7 +486,7 @@ void GroupNormGradKernel(const Context& dev_ctx, set_zero_AccT(dev_ctx, &temp_mean, static_cast(0)); auto* temp_mean_data = temp_mean.data(); - int flags = + auto flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index dcedf1873286a3..9fb15881846782 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -825,7 +825,7 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, params_.eps = epsilon; auto stream = dev_ctx.stream(); DenseTensor redBuffer; - int buffer_sizes = 2 * params_.n * groups; + auto buffer_sizes = 2 * params_.n * groups; redBuffer.Resize({1, buffer_sizes}); params_.redBuffer = dev_ctx.template Alloc(&redBuffer); int64_t max_grid_x = dev_ctx.GetCUDAMaxGridDimSize()[0]; @@ -1201,7 +1201,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx, mean_data, temp_var_data); } - int flags = + auto flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h b/paddle/phi/kernels/gpu/instance_norm_utils.h index 865ab91da7b1b3..0e490734edaa15 100644 --- a/paddle/phi/kernels/gpu/instance_norm_utils.h +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -58,7 +58,8 @@ static __global__ void add_param(const T 
*input, for (int i = blockIdx.x; i < C; i += gridDim.x) { MPType ou = static_cast(0); for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { - const int index = j * C + i; + const auto index(j * C + i); + ou = ou + static_cast(input[index]); } ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index ed5f6438ab0c49..8d812a08f7bacb 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -79,7 +79,7 @@ struct ThreadAssigner<1> { const int cols_per_thread, int *last_tid_idx) { int cols_this_thread = cols_per_thread; - int last_tid = (cols / cols_per_thread); + auto last_tid = (cols / cols_per_thread); *last_tid_idx = last_tid; if (threadIdx.x == last_tid) { cols_this_thread = cols - cols_per_thread * last_tid; diff --git a/paddle/phi/kernels/gpu/lrn_grad_kernel.cu b/paddle/phi/kernels/gpu/lrn_grad_kernel.cu index e582bb7b6cbb91..aeba1c2da91df1 100644 --- a/paddle/phi/kernels/gpu/lrn_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lrn_grad_kernel.cu @@ -34,7 +34,8 @@ __global__ void KeCMRNormDiff(int img_size, if (idx < img_size) { const int w = idx % W; const int h = (idx / W) % H; - const int n = idx / W / H; + const auto n(idx / W / H); + const int offset = (data_layout != DataLayout::kNHWC ? (n * C * H + h) * W + w : ((n * H + h) * W + w) * C); @@ -45,8 +46,9 @@ __global__ void KeCMRNormDiff(int img_size, x_g += offset; const int step = H * W; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; + const auto pre_pad(size - (size + 1) / 2); + + const auto post_pad(size - pre_pad - 1); int index = 0; T accum = 0; @@ -88,7 +90,7 @@ void CrossMapNormalGrad(const phi::GPUContext& dev_ctx, T alpha, T beta, const DataLayout data_layout) { - int img_size = N * H * W; + auto img_size = N * H * W; const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; diff --git a/paddle/phi/kernels/gpu/lrn_kernel.cu b/paddle/phi/kernels/gpu/lrn_kernel.cu index 41c9febf733942..ebf50dd00eb774 100644 --- a/paddle/phi/kernels/gpu/lrn_kernel.cu +++ b/paddle/phi/kernels/gpu/lrn_kernel.cu @@ -31,7 +31,8 @@ __global__ void KeCMRNormFillScale(int img_size, if (idx < img_size) { const int w = idx % W; const int h = (idx / W) % H; - const int n = idx / W / H; + const auto n(idx / W / H); + const int offset = (data_layout != DataLayout::kNHWC ? 
(n * C * H + h) * W + w : ((n * H + h) * W + w) * C); @@ -40,7 +41,7 @@ __global__ void KeCMRNormFillScale(int img_size, mid += offset; const int step = H * W; const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; + const auto post_pad(size - pre_pad - 1); T accum = 0; int index = 0; @@ -90,14 +91,14 @@ void CrossMapNormal(const phi::GPUContext& dev_ctx, T alpha, T beta, const DataLayout data_layout) { - int img_size = N * H * W; + auto img_size = N * H * W; const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha, data_layout); - int input_size = N * H * W * C; + auto input_size = N * H * W * C; grid_size = (input_size + block_size - 1) / block_size; KeCMRNormOutput<<>>( input_size, inputs, mid, -beta, outputs); diff --git a/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu b/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu index 5184edec460c6e..6a43fea5a344d8 100644 --- a/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu +++ b/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu @@ -98,7 +98,8 @@ size_t CalcSortScoresPerClassWorkspaceSize(const int num, const int num_classes, const int num_preds_per_class) { size_t wss[4]; - const int array_len = num * num_classes * num_preds_per_class; + const auto array_len(num * num_classes * num_preds_per_class); + wss[0] = array_len * sizeof(T); // temp scores wss[1] = array_len * sizeof(int); // temp indices wss[2] = (num * num_classes + 1) * sizeof(int); // offsets @@ -180,7 +181,8 @@ __launch_bounds__(nthds_per_cta) __global__ if (cur_idx < num_preds_per_batch) { const int class_idx = cur_idx / num_preds_per_class; for (int i = 0; i < num; i++) { - const int target_idx = i * num_preds_per_batch + cur_idx; + const auto target_idx(i * num_preds_per_batch + cur_idx); + const T_SCORE score = conf_scores_gpu[target_idx]; // "Clear" background labeled score and index @@ -227,7 +229,8 @@ __launch_bounds__(nthds_per_cta) __global__ } if ((cur_idx % num_preds_per_class) == 0) { - const int offset_ct = i * num_classes + cur_idx / num_preds_per_class; + const auto offset_ct(i * num_classes + cur_idx / num_preds_per_class); + d_offsets[offset_ct] = offset_ct * num_preds_per_class; // set the last element in d_offset if (blockIdx.x == 0 && threadIdx.x == 0) @@ -251,7 +254,8 @@ void SortScoresPerClassGPU(gpuStream_t stream, const float score_shift) { const int num_segments = num * num_classes; void* temp_scores = workspace; - const int array_len = num * num_classes * num_preds_per_class; + const auto array_len(num * num_classes * num_preds_per_class); + void* temp_idx = GetNextWorkspacePtr(reinterpret_cast(temp_scores), array_len * sizeof(T_SCORE)); void* d_offsets = GetNextWorkspacePtr(reinterpret_cast(temp_idx), @@ -677,7 +681,8 @@ __launch_bounds__(nthds_per_cta) __global__ i += gridDim.x * nthds_per_cta) { const int imgId = i / keep_top_k; const int detId = i % keep_top_k; - const int offset = imgId * num_classes * top_k; + const auto offset(imgId * num_classes * top_k); + const int index = indices[offset + detId]; const T_SCORE score = scores[offset + detId]; if (index == -1) { diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index 34c4a1391e3dfe..43c443de190341 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -70,7 +70,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs_data, int right = num_categories; while 
(right - left > 0) { - int mid = left + (right - left) / 2; + auto mid = left + (right - left) / 2; T temp_prob = cumulative_probs_data[mid]; if (temp_prob < rng_number) { diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu index 6df5941a1b794e..b41e3935c623b2 100644 --- a/paddle/phi/kernels/gpu/norm_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_kernel.cu @@ -66,7 +66,8 @@ __global__ void Normalize(const T* x, } __syncthreads(); for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { - const int index = base + j * post; + const auto index(base + j * post); + y[index] = static_cast((static_cast(x[index]) / norm)); } } diff --git a/paddle/phi/kernels/gpu/prior_box_kernel.cu b/paddle/phi/kernels/gpu/prior_box_kernel.cu index 73049ae7572dc0..03e152e5078991 100644 --- a/paddle/phi/kernels/gpu/prior_box_kernel.cu +++ b/paddle/phi/kernels/gpu/prior_box_kernel.cu @@ -46,13 +46,13 @@ __global__ void GenPriorBox(T* out, const int min_num, bool is_clip, bool min_max_aspect_ratios_order) { - int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; - int box_num = height * width * num_priors; + auto num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; + auto box_num = height * width * num_priors; CUDA_KERNEL_LOOP(i, box_num) { int h = i / (num_priors * width); int w = (i / num_priors) % width; int p = i % num_priors; - int m = max_sizes ? p / (as_num + 1) : p / as_num; + auto m = max_sizes ? p / (as_num + 1) : p / as_num; T cx = (w + offset) * step_width; T cy = (h + offset) * step_height; T bw, bh; @@ -158,7 +158,7 @@ void PriorBoxKernel(const Context& dev_ctx, num_priors += max_sizes.size(); } int min_num = static_cast(min_sizes.size()); - int box_num = width * height * num_priors; + auto box_num = width * height * num_priors; int block = 512; int grid = (box_num + block - 1) / block; diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu index 13f0b12fa7e0d7..9f964ee1bce719 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -97,7 +97,7 @@ __global__ void GPUPSROIPoolBackward(const int64_t nthreads, T diff_val = is_empty ? 0. 
: dout_data[i] / bin_area; for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; + auto input_index = ih * width + iw; phi::CudaAtomicAdd(offset_dx_data + input_index, diff_val); } } diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu index 1193c18131ce33..9a2065812c07b5 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu @@ -52,7 +52,7 @@ __global__ void GPUPSROIPoolForward(const int nthreads, int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; + auto n = i / pooled_width / pooled_height / output_channels; // set roi_batch_id int roi_batch_id = rois_batch_id_data[n]; @@ -86,7 +86,7 @@ __global__ void GPUPSROIPoolForward(const int nthreads, wend = min(max(wend, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); - int input_channel = (c * pooled_height + ph) * pooled_width + pw; + auto input_channel = (c * pooled_height + ph) * pooled_width + pw; const T* offset_input_data = input_data + (roi_batch_id * input_channels + input_channel) * height * width; @@ -94,7 +94,7 @@ __global__ void GPUPSROIPoolForward(const int nthreads, for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; + auto input_index = ih * width + iw; outsum += offset_input_data[input_index]; } } diff --git a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu index 7144d89c72660e..6e6c76e27d1325 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu +++ b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu @@ -236,7 +236,7 @@ void RepeatInterleaveKernel(const Context& dev_ctx, } // Get actual dimension const int ndim = x.dims().size(); - const int target_dim = (dim < 0) ? ndim + dim : dim; + const auto target_dim((dim < 0) ? ndim + dim : dim); // Calculate sizes int64_t outer_size = 1; diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu index 20015f7b875952..292d52c80fb7f9 100644 --- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu @@ -59,9 +59,11 @@ void HostRMSNormGradient(const Context& dev_ctx, const int part_size = 16; const dim3 threads2(32, 4, 1); const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1); - const int nshared2_a = - 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); - const int nshared2_b = threads2.x * threads2.y * sizeof(U); + const auto nshared2_a(2 * sizeof(U) * threads2.y * threads2.y * + (threads2.x + 1)); + + const auto nshared2_b(threads2.x * threads2.y * sizeof(U)); + const int nshared2 = nshared2_a > nshared2_b ? 
nshared2_a : nshared2_b; std::vector shape = {part_size, n2}; DenseTensor part_grad_gamma( @@ -84,7 +86,8 @@ void HostRMSNormGradient(const Context& dev_ctx, const dim3 threads3(32, 8, 1); const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1); - const int nshared3 = threads3.x * threads3.y * sizeof(U); + const auto nshared3(threads3.x * threads3.y * sizeof(U)); + cuComputeGradGammaBeta<<>>( part_grad_gamma.data(), part_grad_gamma.data(), /* unused */ @@ -100,7 +103,7 @@ void HostRMSNormGradient(const Context& dev_ctx, const uint64_t maxGridY = dev_ctx.GetCUDAMaxGridDimSize()[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); const dim3 threads1(32, 4, 1); - int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0; + auto nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0; const V* gamma_tmp = gamma; cuComputeGradInput<<>>( diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 3cb34f6eaedfbe..c3a46a94db4a2c 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -47,7 +47,7 @@ void RollGradKernel(const Context& dev_ctx, shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; } else { for (int i = 0; i < rank; i++) { - int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + auto dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); int64_t size = input_dim[dim]; if (size != 0) { shifts_data[i] = ((-shifts_data[i]) % size + size) % size; diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index 318551221b1ffb..dcd4ae617cb970 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -47,7 +47,7 @@ void RollKernel(const Context& dev_ctx, shifts_data[0] = (shifts_data[0] % numel + numel) % numel; } else { for (int i = 0; i < rank; i++) { - int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + auto dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); int64_t size = input_dim[dim]; if (size != 0) { diff --git a/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu index ac61f86fed3e19..69eed3208adcda 100644 --- a/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu @@ -121,7 +121,7 @@ __global__ void RowConvGradFilterImproved(const T *in, int xdim_sh_in = block_y; int xdim_sh_dout = block_y; int ydim_sh_in = block_x; - int ydim_sh_dout = block_x + future_context - 1; + auto ydim_sh_dout = block_x + future_context - 1; int ydim_sh_dfilter = block_y; T *sh_in = mem; @@ -154,7 +154,7 @@ __global__ void RowConvGradFilterImproved(const T *in, __syncthreads(); if (thy < future_context - 1) { - int pos_offset = pos - future_context + 1; + auto pos_offset = pos - future_context + 1; sh_dout[thx * ydim_sh_dout + thy] = (d < input_dim && pos_offset >= start) ? 
dout[pos_offset * input_dim + d] diff --git a/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu b/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu index 77ca140bd22ad2..84bd0c43478a13 100644 --- a/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu @@ -68,7 +68,7 @@ struct SequenceExpandGradFunctor { ref_lod.size())); int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; - int thread_z = 1024 / thread_x / thread_y; + auto thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); diff --git a/paddle/phi/kernels/gpu/sequence_expand_kernel.cu b/paddle/phi/kernels/gpu/sequence_expand_kernel.cu index 9c8817431efdbf..62d367916fae09 100644 --- a/paddle/phi/kernels/gpu/sequence_expand_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_expand_kernel.cu @@ -126,7 +126,7 @@ struct SequenceExpandFunctor { int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; - int thread_z = 1024 / thread_x / thread_y; + auto thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); diff --git a/paddle/phi/kernels/gpu/shuffle_channel.h b/paddle/phi/kernels/gpu/shuffle_channel.h index 59e067374e113d..50c574a1484e38 100644 --- a/paddle/phi/kernels/gpu/shuffle_channel.h +++ b/paddle/phi/kernels/gpu/shuffle_channel.h @@ -37,9 +37,11 @@ __global__ void ShuffleChannel(const int nthreads, int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (size_t ii = index; ii < nthreads; ii += offset) { - const int n = index / group_row / group_column / len; + const auto n(index / group_row / group_column / len); + const int i = (index / group_column / len) % group_row; - const int j = index / len % group_column; + const auto j(index / len % group_column); + const int k = index - (n * feature_map_size + (i * group_column + j) * len); T* p_o = output + n * feature_map_size + (j * group_row + i) * len; p_o[k] = input[index]; diff --git a/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu b/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu index 3c130e4ec56751..3bad8066dc8611 100644 --- a/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu @@ -40,7 +40,7 @@ void ShuffleChannelGradOpCUDAKernel(const Context& dev_ctx, int blocks = NumBlocks(out_grad.numel()); int threads = kNumCUDAThreads; - int count = num * group_column * group_row * sp_sz; + auto count = num * group_column * group_row * sp_sz; ShuffleChannel<<>>(count, feature_map_size, diff --git a/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu b/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu index 6348a486f2e735..0270753c61900c 100644 --- a/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu @@ -36,7 +36,7 @@ void ShuffleChannelOpCUDAKernel(const Context& dev_ctx, int group_row = group; int group_column = channel / group_row; // count is the product of NCHW same as numel() - int count = num * group_column * group_row * sp_sz; + auto count = num * group_column * group_row * sp_sz; int blocks = NumBlocks(out->numel()); int threads = kNumCUDAThreads; diff --git a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu index 
fde94d4b70a188..81995c36ded47a 100644 --- a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu +++ b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu @@ -268,7 +268,7 @@ __global__ void GetSlogDetV2FromLU(const T* lu_data, T* logdet_data) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < batch_size) { - int offset_lu = idx * n * n; + auto offset_lu = idx * n * n; int offset_ipiv = idx * n; T det_val = T(1.0); for (int i = 0; i < n; i++) { diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index d7df2581f9656e..a2f6a2eb39fbca 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -529,7 +529,7 @@ __global__ void KeMatrixTopPBeamTopKFt(const T* src, count_iter_begin[bid] += 1; if (val < threshold_now) { // don't sample low score token - int start_id = i == 0 ? 0 : i - 1; + auto start_id = i == 0 ? 0 : i - 1; for (int j = start_id; j >= 0; j--) { float val_now = static_cast(beam_max[j].v); if (val_now >= threshold_now || j == 0) { diff --git a/paddle/phi/kernels/gpu/tril_indices_kernel.cu b/paddle/phi/kernels/gpu/tril_indices_kernel.cu index be83f28451166b..1193284f074bef 100644 --- a/paddle/phi/kernels/gpu/tril_indices_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_indices_kernel.cu @@ -24,7 +24,7 @@ namespace phi { template __device__ inline int resolve_root_int(int b, int cX4, int x, int32_t sign) { - int bXb_cX4 = b * b - cX4; + auto bXb_cX4 = b * b - cX4; double sr = ::sqrt(static_cast(bXb_cX4)); T res = ::__double2ll_rd((-b + sign * sr) / 2); if (bXb_cX4 != static_cast(sr * sr)) { diff --git a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu index 98d2bfbea0743b..7bddfce64af2a8 100644 --- a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu @@ -37,7 +37,7 @@ __global__ void KernelUnpool2dMaxGrad(const int64_t nthreads, T* input_grad) { CUDA_KERNEL_LOOP_TYPE(linearIndex, nthreads, int64_t) { int c = (linearIndex / input_width / input_height) % channels; - int n = linearIndex / input_width / input_height / channels; + auto n = linearIndex / input_width / input_height / channels; output_grad += (n * channels + c) * output_height * output_width; IndT maxind = indices_data[linearIndex]; input_grad[linearIndex] = output_grad[maxind]; @@ -60,7 +60,7 @@ __global__ void KernelUnpool3dMaxGrad(const int64_t nthreads, T* input_grad) { CUDA_KERNEL_LOOP_TYPE(linearIndex, nthreads, int64_t) { int c = (linearIndex / input_depth / input_width / input_height) % channels; - int n = linearIndex / input_depth / input_width / input_height / channels; + auto n = linearIndex / input_depth / input_width / input_height / channels; output_grad += (n * channels + c) * output_depth * output_height * output_width; IndT maxind = indices_data[linearIndex]; diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index af6169ba9cb7b1..d0897612fa65b3 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -233,7 +233,7 @@ void ViterbiDecodeKernel(const Context& dev_ctx, std::vector historys; // We create tensor buffer in order to avoid allocating memory frequently // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... 
- int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; + auto buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; DenseTensor int_buffer = Empty(dev_ctx, {buffer_size}); funcs::TensorBuffer int_tensor_buffer(int_buffer); // create float tensor buffer diff --git a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu index ce22758e407862..82dac78871fe01 100644 --- a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu @@ -49,7 +49,7 @@ __device__ __forceinline__ float GenKeyFromWeight( random_num2 = rng.Random64(); seed_count++; } while (!random_num2); - int one_bit = __clzll(random_num2) + seed_count * 64; + auto one_bit = __clzll(random_num2) + seed_count * 64; u *= exp2f(-one_bit); float logk = (log1pf(u) / logf(2.0)) * (1 / weight); return logk; @@ -261,7 +261,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ const int tx = threadIdx.x; #pragma unroll for (int j = 0; j < ITEMS_PER_THREAD; j++) { - int idx = BLOCK_SIZE * j + tx; + auto idx = BLOCK_SIZE * j + tx; if (idx < neighbor_count) { float thread_weight = edge_weight[start + idx]; weight_keys[j] = GenKeyFromWeight(thread_weight, rng); @@ -274,14 +274,14 @@ __launch_bounds__(BLOCK_SIZE) __global__ BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped( weight_keys, neighbor_idxs, max_sample_count, valid_count); __syncthreads(); - const int stride = BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count; + const auto stride(BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count); for (int idx_offset = ITEMS_PER_THREAD * BLOCK_SIZE; idx_offset < neighbor_count; idx_offset += stride) { #pragma unroll for (int j = 0; j < ITEMS_PER_THREAD; j++) { - int local_idx = BLOCK_SIZE * j + tx - max_sample_count; + auto local_idx = BLOCK_SIZE * j + tx - max_sample_count; int target_idx = idx_offset + local_idx; if (local_idx >= 0 && target_idx < neighbor_count) { float thread_weight = edge_weight[start + target_idx]; @@ -299,7 +299,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ } #pragma unroll for (int j = 0; j < ITEMS_PER_THREAD; j++) { - int idx = j * BLOCK_SIZE + tx; + auto idx = j * BLOCK_SIZE + tx; if (idx < max_sample_count) { sample_output[offset + idx] = in_rows[start + neighbor_idxs[j]]; if (return_eids) { diff --git a/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu index a4821e6534463d..c1cea289d6b09c 100644 --- a/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu @@ -40,7 +40,7 @@ __global__ void YoloBoxHeadCudaKernel(const T* input, return; } const int grids_num = grid_size_x * grid_size_y; - const int bbindex = y_id * grid_size_x + x_id; + const auto bbindex(y_id * grid_size_x + x_id); // objectness output[bbindex + grids_num * (z_id * (5 + class_num) + 4)] = @@ -81,7 +81,8 @@ void YoloBoxHeadKernel(const Context& dev_ctx, const T* input_data = x.data(); T* output_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); auto stream = dev_ctx.stream(); - const int volume = x_dims[1] * h * w; + const auto volume(x_dims[1] * h * w); + dim3 block(16, 16, 4); dim3 grid((grid_size_x / block.x) + 1, (grid_size_y / block.y) + 1, diff --git a/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu index 1e2613c5cab773..7200459df3fe7c 100644 --- a/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu @@ 
-147,7 +147,8 @@ __global__ void YoloBoxNum(const float* input, } const int grids_num = grid_size * grid_size; - const int bbindex = y_id * grid_size + x_id; + const auto bbindex(y_id * grid_size + x_id); + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; if (objectness < prob_thresh) { return; @@ -178,7 +179,8 @@ __global__ void YoloTensorParseKernel(const float* input, const float pic_h = im_shape_data[0] / im_scale_data[0]; const float pic_w = im_shape_data[1] / im_scale_data[1]; const int grids_num = grid_size * grid_size; - const int bbindex = y_id * grid_size + x_id; + const auto bbindex(y_id * grid_size + x_id); + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; if (objectness < prob_thresh) { return; @@ -434,7 +436,7 @@ void YoloBoxPostKernel(const Context& dev_ctx, int c = boxes_input_dims[input_id][1]; int h = boxes_input_dims[input_id][2]; int w = boxes_input_dims[input_id][3]; - int ts_id = batch_id * boxes_input.size() + input_id; + auto ts_id = batch_id * boxes_input.size() + input_id; int bbox_count_max_alloc = ts_info[ts_id].bbox_count_max_alloc; YoloTensorParseCuda( @@ -494,7 +496,7 @@ void YoloBoxPostKernel(const Context& dev_ctx, for (int batch_id = 0; batch_id < batch; batch_id++) { std::vector bbox_det_vec; for (int input_id = 0; input_id < boxes_input.size(); input_id++) { - int ts_id = batch_id * boxes_input.size() + input_id; + auto ts_id = batch_id * boxes_input.size() + input_id; int bbox_count = ts_info[ts_id].bbox_count_host; if (bbox_count <= 0) { continue; diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 6a4280faa4aea0..380f448aa829ce 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -129,8 +129,8 @@ void ConvCudnnGradKernelImplV7( &o_w); } - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; + auto group_offset_in = i_c / groups * i_h * i_w * i_d; + auto group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel->numel() / groups; // ------------------- cudnn backward algorithm --------------------- @@ -1186,8 +1186,8 @@ void ConvCudnnGradGradKernel( &o_h, &o_w); - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; + auto group_offset_in = i_c / groups * i_h * i_w * i_d; + auto group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = W->numel() / groups; ScalingParamType alpha = 1.0f; diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 42ff83420526a9..47a7f6f58ca19b 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -142,8 +142,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, &o_w); } - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; + auto group_offset_in = i_c / groups * i_h * i_w * i_d; + auto group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel->numel() / groups; // ------------------- cudnn conv workspace --------------------- size_t workspace_size = 0; // final workspace to allocate. 
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index ee222d3291b3cd..362891fc2eb067 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -290,8 +290,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, // ------------------- cudnn conv backward data --------------------- // FIxME(typhoonzero): template type T may not be the same as cudnn call. - int x_offset = x.numel() / x.dims()[0] / groups; - int dout_offset = + auto x_offset = x.numel() / x.dims()[0] / groups; + auto dout_offset = transformed_dout.numel() / transformed_dout.dims()[0] / groups; int filter_offset = filter.numel() / groups; ScalingParamType alpha = 1.0f; @@ -835,9 +835,9 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( &o_h, &o_w); - int group_offset_in = + auto group_offset_in = transformed_x.numel() / transformed_x.dims()[0] / groups; - int group_offset_out = + auto group_offset_out = transformed_dout.numel() / transformed_dout.dims()[0] / groups; int group_offset_filter = filter.numel() / groups; diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 26b8827620c759..6a8e01d3d94bf8 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -114,8 +114,8 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, #endif // ------------------- cudnn conv transpose forward --------------------- - int x_offset = transformed_x->numel() / transformed_x->dims()[0] / groups; - int out_offset = + auto x_offset = transformed_x->numel() / transformed_x->dims()[0] / groups; + auto out_offset = transformed_out->numel() / transformed_out->dims()[0] / groups; int filter_offset = filter->numel() / groups; ScalingParamType alpha = 1.0f; diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 7706299a92d92c..240f1a4f3c7216 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -1295,7 +1295,7 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx, // use 128 threads per block to maximize gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; IndexType blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(warp_size, warps_per_block, 1); @@ -1389,7 +1389,7 @@ void SoftmaxBackwardCUDAKernelDriverImpl(const GPUContext& dev_ctx, constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; IndexType blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(warp_size, warps_per_block, 1); diff --git a/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h b/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h index aee6bd1e5ab9cc..40abc56e5b7c4b 100644 --- a/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h +++ b/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h @@ -112,7 +112,7 @@ void AnchorGeneratorOpKernel(const Context& dev_ctx, var_et(0, i) = variances[i]; } - int anchor_num = feature_height * feature_width * num_anchors; + auto anchor_num = 
feature_height * feature_width * num_anchors; auto var_dim = vars->dims(); vars->Resize({anchor_num, static_cast(variances.size())}); diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h index fe734bfb3dc441..b022b7774dfa1f 100644 --- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -50,8 +50,8 @@ void ApplyBroadcast(const Context& dev_ctx, Eigen::DSizes bcast_dims; std::vector new_input_dims_vec(out_rank); for (int i = 0; i < out_rank; i++) { - int in_axis = in_rank - i - 1; - int out_axis = out_rank - i - 1; + auto in_axis = in_rank - i - 1; + auto out_axis = out_rank - i - 1; bcast_dims[out_axis] = output_dims[out_axis]; new_input_dims_vec[out_axis] = 1; diff --git a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h index f4dd48013d4b04..32c2eaba5ad20e 100644 --- a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h @@ -66,9 +66,11 @@ struct EyeFunctor { HOSTDEVICE void operator()(size_t index) const { const int global_row = index / n_; - const int col = index - global_row * n_; + const auto col(index - global_row * n_); + const int batch = global_row / m_; - const int row = global_row - batch * m_; + const auto row(global_row - batch * m_); + output_[index] = col == row ? static_cast(1) : static_cast(0); } @@ -103,10 +105,12 @@ struct MatrixSetDiagFunctor { HOSTDEVICE void operator()(size_t index) const { const int batch_and_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_diag_index * max_diag_len_; + const auto index_in_the_diagonal(index - + batch_and_diag_index * max_diag_len_); + const int batch = batch_and_diag_index / num_diags_; - const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; + const auto diag_index_in_input(batch_and_diag_index - batch * num_diags_); + // diag_index=0 refers to the main diagonal const int diag_index = upper_diag_index_ - diag_index_in_input; // shift down for subdiagonal if diag_index < 0 @@ -119,7 +123,8 @@ struct MatrixSetDiagFunctor { // Upper-bound checks for diagonals shorter than max_diag_len. // y_index and x_index are nonnegative by construction. 
if (y_index < m_ && x_index < n_) { - const int out_index = batch * m_ * n_ + y_index * n_ + x_index; + const auto out_index(batch * m_ * n_ + y_index * n_ + x_index); + output_[out_index] = diag_[index]; } } @@ -152,11 +157,13 @@ struct MatrixDiagPartFunctor { HOSTDEVICE void operator()(size_t index) const { const int batch_and_mapped_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_mapped_diag_index * max_diag_len_; + const auto index_in_the_diagonal(index - batch_and_mapped_diag_index * + max_diag_len_); + const int batch = batch_and_mapped_diag_index / num_diags_; - const int mapped_diag_index = - batch_and_mapped_diag_index - batch * num_diags_; + const auto mapped_diag_index(batch_and_mapped_diag_index - + batch * num_diags_); + // diag_index=0 refers to the main diagonal const int diag_index = upper_diag_index_ - mapped_diag_index; // shift down for subdiagonal if diag_index < 0 diff --git a/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h b/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h index 761fc62e20b263..3edb2b9495d52d 100644 --- a/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h +++ b/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h @@ -109,9 +109,10 @@ void CollectFpnProposalsOpKernel( integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois; } - const int batch_size = (num_size == 0) - ? multi_layer_rois[0]->lod().back().size() - 1 - : multi_rois_num[0]->numel(); + const auto batch_size((num_size == 0) + ? multi_layer_rois[0]->lod().back().size() - 1 + : multi_rois_num[0]->numel()); + // concatenate all fpn rois scores into a list // create a vector to store all scores std::vector> scores_of_all_rois( diff --git a/paddle/phi/kernels/impl/diag_embed_impl.h b/paddle/phi/kernels/impl/diag_embed_impl.h index c6dd1cf7df4871..7e76e9489a3a23 100644 --- a/paddle/phi/kernels/impl/diag_embed_impl.h +++ b/paddle/phi/kernels/impl/diag_embed_impl.h @@ -83,8 +83,8 @@ void DiagEmbedKernel(const Context& dev_ctx, set_zero(dev_ctx, out, static_cast(0.0)); auto out_dims = out->dims(); - int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; - int dim2_ = dim2 < 0 ? out_dims.size() + dim2 : dim2; + auto dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; + auto dim2_ = dim2 < 0 ? 
out_dims.size() + dim2 : dim2; auto stride = common::stride(out_dims); int64_t diag_size; int64_t storage_offset = 0; diff --git a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h index f204d2efdc6d3a..1a9154d48fb0a6 100644 --- a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h @@ -40,14 +40,14 @@ void FoldGradKernel(const Context& dev_ctx, const auto& x_dims = x_grad->dims(); const int64_t batch_size = x_dims[0]; - int output_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + + auto output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + auto output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + 1; - int output_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; int64_t n_input_plane = x_dims[1]; int64_t n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h index a0ac45d3c6bfe6..a84a5577fef891 100644 --- a/paddle/phi/kernels/impl/fold_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -39,14 +39,14 @@ void FoldKernel(const Context& dev_ctx, phi::funcs::Col2ImFunctor col2im; const auto& x_dims = x.dims(); - int output_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + + auto output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + auto output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + 1; - int output_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; int64_t n_input_plane = x_dims[1]; int64_t n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); diff --git a/paddle/phi/kernels/impl/im2sequence_kernel_impl.h b/paddle/phi/kernels/impl/im2sequence_kernel_impl.h index a6265e5b30836f..444291aabd005a 100644 --- a/paddle/phi/kernels/impl/im2sequence_kernel_impl.h +++ b/paddle/phi/kernels/impl/im2sequence_kernel_impl.h @@ -26,8 +26,9 @@ namespace phi { inline int Im2SeqOutputSize( int input_size, int filter_size, int padding_0, int padding_1, int stride) { - const int output_size = - (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + const auto output_size( + (input_size + padding_0 + padding_1 - filter_size) / stride + 1); + return output_size; } diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h index 1a23e6d845781d..95c9028aa1cdc0 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -240,7 +240,7 @@ __global__ void ReduceAbsMaxKernel(const T* x, for (int row_idx = blockIdx.x; row_idx < rows; row_idx += gridDim.x) { for (int col_idx = threadIdx.x * VecSize; col_idx < cols; col_idx += blockDim.x * VecSize) { - int32_t linear_index = row_idx * cols + col_idx; + auto linear_index = row_idx * cols + col_idx; phi::Load(x + linear_index, &in_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -284,7 +284,7 @@ __global__ void QuantActKernel(const T* x, linear_index < elem_cnt; linear_index += 
gridDim.x * blockDim.x * VecSize) { int row_idx = linear_index / cols; - int col_idx = + auto col_idx = linear_index - row_idx * cols; // equal to linear_index % cols phi::Load(x + linear_index, &in_vec); int32_t local_outlier_idx = outlier_idx[col_idx / 32]; @@ -353,13 +353,13 @@ __global__ void SplitKernel(const T* x, if (linear_idx < sub_w_elem_cnt) { constexpr int32_t k_permute_const = 8; int32_t k_mod_16 = k_id % 16; - int32_t temp_k_expr_1 = k_mod_16 - k_mod_16 / 8 * 8; + auto temp_k_expr_1 = k_mod_16 - k_mod_16 / 8 * 8; int32_t temp_k_expr_2 = k_mod_16 / 8; - int32_t permute_kk = temp_k_expr_1 + temp_k_expr_2 + - (temp_k_expr_2 + 1) % 2 * k_mod_16 * 2 / 2 + - temp_k_expr_1 * temp_k_expr_2 + k_id / 16 * 16; - int32_t permute_index = permute_kk % 64 + permute_kk / 64 * 128 + - 64 * (row_idx % 2) + k * 2 * (row_idx / 2); + auto permute_kk = temp_k_expr_1 + temp_k_expr_2 + + (temp_k_expr_2 + 1) % 2 * k_mod_16 * 2 / 2 + + temp_k_expr_1 * temp_k_expr_2 + k_id / 16 * 16; + auto permute_index = permute_kk % 64 + permute_kk / 64 * 128 + + 64 * (row_idx % 2) + k * 2 * (row_idx / 2); int8_t shifted_weight = static_cast( static_cast(weight[permute_index]) - 128); sub_weight[row_idx * kfp_num + col_idx] = @@ -431,7 +431,7 @@ __global__ void DequantMergeKernel(const int32_t* x, for (int row_idx = blockIdx.x; row_idx < m; row_idx += gridDim.x) { for (int col_idx = threadIdx.x * VecSize; col_idx < n; col_idx += blockDim.x * VecSize) { - int linear_idx = row_idx * n + col_idx; + auto linear_idx = row_idx * n + col_idx; phi::Load(x_fp + linear_idx, &x_fp_vec); phi::Load(x + linear_idx, &x_vec); #pragma unroll diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eebbf23..d9b621b2e75683 100644 --- a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -26,7 +26,8 @@ struct IdentityMatrixFunctor { IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; + const auto row(index / m_ % m_); + const int col = index % m_; output_[index] = col == row ? 
static_cast(1) : static_cast(0); } diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h index 3546b91d66fc12..c17a2a9ed20e9d 100644 --- a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h @@ -40,7 +40,7 @@ void UnStackGradKernel(const Context &dev_ctx, for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; #if defined(__NVCC__) || defined(__HIPCC__) - int total_num = pre * n * post; + auto total_num = pre * n * post; thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h index 82c78aad85e5ef..1723068a33afe9 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h @@ -150,7 +150,7 @@ __global__ void weight_interleave_add_bias_kernel_wint4(int8_t* input_data_dev, #pragma unroll for (int idx = 0; idx < 8; ++idx) { const int offset = idx / 4; - const int src = (idx % 4) * 2 + offset; + const auto src((idx % 4) * 2 + offset); const int src_shift = src * 4; const int dst_shift = idx * 4; @@ -314,7 +314,7 @@ __global__ void per_channel_quant_gpu_int4_row_pack(const T* weight_data, for (int i = 0; i < VectorSize / 2; ++i) { int8_t packed_int4s = 0; for (int pack = 0; pack < 2; ++pack) { - int vector_index = i * 2 + pack; + auto vector_index = i * 2 + pack; const float r_scale = 1 / static_cast(scale[vector_index]); const float weight_elt = static_cast(weight[vector_index]) * r_scale; diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h index bd1c4b1d865af2..eab8135a221740 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h @@ -224,8 +224,9 @@ void add_bias_and_interleave_inplace(int8_t* tensor_ptr, size_t num_elts) { uint32_t transformed_register = 0; for (int dest_idx = 0; dest_idx < 8; ++dest_idx) { - const int src_idx = - dest_idx < 4 ? 2 * dest_idx : 2 * (dest_idx - 4) + 1; + const auto src_idx(dest_idx < 4 ? 2 * dest_idx + : 2 * (dest_idx - 4) + 1); + const int src_shift = 4 * src_idx; const int dest_shift = 4 * dest_idx; @@ -264,8 +265,9 @@ void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor, for (int tile_row = 0; tile_row < B_ROWS_PER_MMA; ++tile_row) { for (int write_col = 0; write_col < num_vec_cols; ++write_col) { const int write_row = base_row + tile_row; - const int tile_read_row = 8 * (((tile_row % ELTS_PER_REG) / 2)) + - tile_row % 2 + 2 * (tile_row / ELTS_PER_REG); + const auto tile_read_row(8 * (((tile_row % ELTS_PER_REG) / 2)) + + tile_row % 2 + 2 * (tile_row / ELTS_PER_REG)); + const int read_row = base_row + tile_read_row; const int read_col = write_col; diff --git a/paddle/phi/kernels/onednn/concat_kernel.cc b/paddle/phi/kernels/onednn/concat_kernel.cc index 2e7d79a330cee7..a72725909ea744 100644 --- a/paddle/phi/kernels/onednn/concat_kernel.cc +++ b/paddle/phi/kernels/onednn/concat_kernel.cc @@ -74,7 +74,7 @@ class ConcatOneDNNHandler : public OneDNNHandlerNoCachingT { bool ConcatCheckIfOneDNNSupport(const KernelContext* dev_ctx) { auto input0 = dev_ctx->InputAt(0); - int batch_size = + auto batch_size = !input0.lod().empty() ? 
input0.lod()[0].size() - 1 : input0.dims()[0]; if (dev_ctx->InputsSize() > 64 && batch_size < 1000) { return false; diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc index b1b6db198e3a12..0325d980559e26 100644 --- a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -52,9 +52,9 @@ void CalculateMatrixDims(const std::vector &x_dims, for (size_t i = 0; i < x_bd_dims->size() - 2; ++i) { (*out_bd_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - int h_idx = + auto h_idx = trans_x ? x_bd_dims->size() - 1 : x_bd_dims->size() - 2; // NOLINT - int w_idx = + auto w_idx = trans_y ? y_bd_dims->size() - 2 : y_bd_dims->size() - 1; // NOLINT (*out_bd_dims)[x_bd_dims->size() - 2] = (*x_bd_dims)[h_idx]; diff --git a/paddle/phi/kernels/onednn/multi_gru_kernel.cc b/paddle/phi/kernels/onednn/multi_gru_kernel.cc index 5a7250fc312fc6..debfb9c6d1194a 100644 --- a/paddle/phi/kernels/onednn/multi_gru_kernel.cc +++ b/paddle/phi/kernels/onednn/multi_gru_kernel.cc @@ -145,11 +145,11 @@ class MultiGRUHandler { layers_ * 2, scale_weights.size())); - const int weights_scale_mask = + const auto weights_scale_mask( 0 + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` - + - (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + (1 << 4)); + // bit, indicating the unique scales for `o` dim in `ldigo` int w_scale_num = scale_weights.size(); for (int i = 0; i < w_scale_num; ++i) { @@ -371,7 +371,7 @@ class MultiGRUHandler { auto* weight_x_data = reinterpret_cast(user_memory.get_data_handle()); - int idx = layer * 2 + (dir == R2L); + auto idx = layer * 2 + (dir == R2L); memcpy(weight_x_data, weights_x_[idx]->data(), sizeof(float) * ICs[layer] * 3 * OCs[layer]); @@ -414,7 +414,7 @@ class MultiGRUHandler { auto* weight_h_data = reinterpret_cast(user_memory.get_data_handle()); - int idx = layer * 2 + (dir == R2L); + auto idx = layer * 2 + (dir == R2L); auto* user_weight_h_data = weights_h_[idx]->data(); auto src1_iter = user_weight_h_data; @@ -465,7 +465,7 @@ class MultiGRUHandler { gru_pds_[{layer, dir}]->bias_desc(), engine_); auto* bias_data = reinterpret_cast(memory_p->get_data_handle()); - int idx = layer * 2 + (dir == R2L); + auto idx = layer * 2 + (dir == R2L); if (!biases_.empty() && biases_[idx]) { const float* user_bias_data = biases_[idx]->data(); // Bias in oneDNN is always float diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index 10983a4ef75290..8c0a3f9c588c3d 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -30,7 +30,7 @@ inline std::vector CalculateReducedDims( std::vector output_dims(common::vectorize(input->dims())); for (size_t i = 0; i < dims.size(); ++i) { // handle negative dims, f.e. "-1" means rightmost dimension - int index = (dims[i] >= 0) ? dims[i] : input->dims().size() + dims[i]; + auto index = (dims[i] >= 0) ? 
dims[i] : input->dims().size() + dims[i]; output_dims[index] = 1; } diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 11481a8b0249a8..c1f05936e5a668 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -543,7 +543,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1]; for (int stride = 1; stride <= stride_size; stride *= 2) { __syncthreads(); - int index = (tidx + 1) * 2 * stride - 1; + auto index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { temp[index + index / 32] = compute(temp[index + index / 32], @@ -552,7 +552,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, } for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) { __syncthreads(); - int index = (tidx + 1) * 2 * stride - 1; + auto index = (tidx + 1) * 2 * stride - 1; if ((index + stride) < (blockDim.x * 2)) { temp[index + stride + (stride + index) / 32] = compute(temp[index + stride + (stride + index) / 32], diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 75f510c13d18ff..e94e9b2916f6e9 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -140,7 +140,7 @@ struct BroadcastConfig { return kps::details::GetXpuReadLens(numel, 8, 64); } int max_buf_len = 512; - int buf_len = m / 16 * 16; + auto buf_len = m / 16 * 16; if (buf_len == 0) { buf_len = m; } @@ -408,7 +408,7 @@ __device__ __inline__ void ReadData(Ty* dst, break; } } - int fix = thread_offset + idx * stride_nx + idy * stride_ny; + auto fix = thread_offset + idx * stride_nx + idy * stride_ny; mfence_local(); GM2LM(src + fix, in_temp, sizeof(Tx)); dst[idy * NX + idx] = static_cast(in_temp[0]); @@ -1194,7 +1194,7 @@ __device__ __inline__ void ReadDataBc(T* dst, const details::BroadcastConfig& config, int total_num_output, int read_lens) { - int thread_offset = block_offset + core_id() * read_lens; + auto thread_offset = block_offset + core_id() * read_lens; if (config.cmp_type == details::OptType::MNK_M1K) { ReadDataBcM1kMnk(dst, src, thread_offset, config, read_lens); @@ -1248,7 +1248,7 @@ __device__ __forceinline__ void ReadDataBc( const details::BroadcastConfig& config, int total_num_output, int read_lens = NX) { - int thread_offset = block_offset + core_id() * read_lens; + auto thread_offset = block_offset + core_id() * read_lens; __local__ T in_temp[NX]; if (config.cmp_type == details::OptType::MNK_M1K) { @@ -1286,7 +1286,7 @@ __device__ __forceinline__ void ReadDataBc( */ template __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { - int thread_offset = block_offset + core_id() * NX; + auto thread_offset = block_offset + core_id() * NX; #pragma unroll for (int nx = 0; nx < NX; ++nx) { dst[nx] = static_cast(thread_offset + nx); diff --git a/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu b/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu index 437094d1422d35..978869851fb096 100644 --- a/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu +++ b/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu @@ -55,7 +55,7 @@ phi::DenseTensor CheckMultipleUnsqueeze(const Context& dev_ctx, std::vector axes(ndim, false); for (int i = 0; i < dims.size(); i++) { - int tmp_dim = dims[i] >= 0 ? dims[i] : ndim + dims[i]; + auto tmp_dim = dims[i] >= 0 ? 
dims[i] : ndim + dims[i]; axes[tmp_dim] = true; } diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 6e413ef73098dd..9a6ae0b4fec68d 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -136,7 +136,7 @@ void DeserializeOnCPU(const Context& dev_ctx, StringTensor* dst) { auto* strings_data = reinterpret_cast(src.data()); auto* strings_offset = reinterpret_cast(strings_data); - int numel = strings_offset[0] / sizeof(int) - 1; + auto numel = strings_offset[0] / sizeof(int) - 1; dst->Resize(common::make_ddim({numel})); dtype::pstring* dst_str = dev_ctx.template HostAlloc(dst); for (int i = 0; i < numel; ++i) { From 56e78de1dbf3184260bb2eb385aa79a2e70b93ae Mon Sep 17 00:00:00 2001 From: zrr1999 <2742392377@qq.com> Date: Fri, 7 Nov 2025 09:09:57 +0000 Subject: [PATCH 2/2] fix --- paddle/phi/kernels/strings/gpu/copy_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 9a6ae0b4fec68d..66f0a67df075c4 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -136,7 +136,7 @@ void DeserializeOnCPU(const Context& dev_ctx, StringTensor* dst) { auto* strings_data = reinterpret_cast(src.data()); auto* strings_offset = reinterpret_cast(strings_data); - auto numel = strings_offset[0] / sizeof(int) - 1; + int64_t numel = strings_offset[0] / sizeof(int) - 1; dst->Resize(common::make_ddim({numel})); dtype::pstring* dst_str = dev_ctx.template HostAlloc(dst); for (int i = 0; i < numel; ++i) {
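
For context on the final hunk ([PATCH 2/2]): in `strings_offset[0] / sizeof(int) - 1`, the `sizeof` operand pulls the whole expression up to the unsigned `size_t`, so an `auto` declaration deduces an unsigned type; pinning `numel` to `int64_t` keeps the element count signed, which is presumably what `common::make_ddim` and the `int` loop counter expect. The standalone C++17 sketch below is illustrative only — it is not Paddle code, and the names `rows`, `cols`, and `header` are invented. It shows the two deduction effects that motivate this series: `auto` preserves 64-bit width in index arithmetic (as in `auto total_q = batch_size * sizes[1]`), but it also inherits unsignedness once `sizeof` joins the expression.

// Standalone C++17 sketch (illustrative only, not Paddle code).
// (1) `auto` keeps 64-bit width for index arithmetic instead of
//     silently narrowing to int;
// (2) `auto` also picks up unsignedness once sizeof() enters the
//     expression, which is why the second commit names int64_t.
#include <cstdint>
#include <cstdio>
#include <type_traits>

int main() {
  // (1) Both operands are int64_t, so the product stays 64-bit.
  //     Declaring the result as `int` would truncate ~3.07e9.
  int64_t rows = 3000000, cols = 1024;
  auto linear_index = rows * cols;  // deduced as int64_t
  static_assert(std::is_same_v<decltype(linear_index), int64_t>);

  // (2) `header` stands in for a serialized offset word; dividing by
  //     sizeof(int) converts the whole expression to size_t, so the
  //     `- 1` wraps around instead of going negative.
  int header = 0;
  auto numel_auto = header / sizeof(int) - 1;  // deduced as size_t
  static_assert(std::is_same_v<decltype(numel_auto), std::size_t>);
  std::printf("auto-deduced numel: %zu (wrapped)\n", numel_auto);

  // Pinning the declared type, as the follow-up commit does, keeps the
  // count in signed 64-bit territory; the cast makes the intent explicit.
  int64_t numel = static_cast<int64_t>(header / sizeof(int)) - 1;
  std::printf("int64_t numel: %lld\n", static_cast<long long>(numel));
  return 0;
}

In short, `auto` is a reasonable default for intermediate index/offset expressions whose operands already have the intended width, while expressions involving `sizeof`, `.size()`, or other unsigned sources are safer with an explicitly named signed type, as the second commit does for `numel`.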