PaddlePaddle · zrr1999 · Nov 7, 2025 · Nov 7, 2025
diff --git a/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc b/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc
@@ -76,8 +76,9 @@ void AddPositionEncodingKernel(const Context& dev_ctx,
 
   const int half_size = enc_size / 2;
   for (int i = 0; i < batch_size; ++i) {
-    const int max_length =
-        x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
+    // Cast keeps max_length an int, exactly as before the switch to auto.
+    const auto max_length = static_cast<int>(
+        x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]);
     for (int j = 0; j < max_length; ++j) {
       for (int k = 0; k < half_size; ++k) {
         const double val =

diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
@@ -181,7 +181,7 @@ void BatchNormGradFunctor(const Context& dev_ctx,
     bias_arr.setZero();
   }
 
-  int scale_coeff = use_global_stats ? 1 : N * sample_size;
+  auto scale_coeff = use_global_stats ? 1 : N * sample_size;
   const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coeff;
 
   DenseTensor dy_sum;

diff --git a/paddle/phi/kernels/cpu/box_coder_kernel.cc b/paddle/phi/kernels/cpu/box_coder_kernel.cc
@@ -120,7 +120,7 @@ void DecodeCenterSize(const DenseTensor *target_box,
       std::array<T, 4> var_data{1., 1., 1., 1.};
       T *var_ptr = var_data.data();
       size_t offset = i * col * len + j * len;
-      int prior_box_offset = axis == 0 ? j * len : i * len;
+      auto prior_box_offset = axis == 0 ? j * len : i * len;
 
       T prior_box_width = prior_box_data[prior_box_offset + 2] -
                           prior_box_data[prior_box_offset] +
@@ -135,7 +135,7 @@ void DecodeCenterSize(const DenseTensor *target_box,
 
       T target_box_center_x = 0, target_box_center_y = 0;
       T target_box_width = 0, target_box_height = 0;
-      int prior_var_offset = axis == 0 ? j * len : i * len;
+      auto prior_var_offset = axis == 0 ? j * len : i * len;
       if (var_size == 2) {
         std::memcpy(var_ptr,
                     prior_box_var->data<T>() + prior_var_offset,

diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -112,8 +112,8 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx,
     std::vector<int> reduce_dims_vec;
     std::vector<int> reshape_dims_vec;
     for (int j = 0; j < in_rank; j++) {
-      int out_axis = out_rank - j - 1;
-      int in_axis = in_rank - j - 1;
+      auto out_axis = out_rank - j - 1;
+      auto in_axis = in_rank - j - 1;
 
       reshape_dims_vec.push_back(static_cast<int>(input_dims[j]));
       if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {

diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h
@@ -77,8 +77,9 @@ inline int ConvOutSize(int input_size,
                        int pad_left,
                        int pad_right,
                        int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size =
+  const auto dkernel(dilation * (filter_size - 1) + 1);
+
+  auto output_size =
       (input_size + (pad_left + pad_right) - dkernel) / stride + 1;
 
   PADDLE_ENFORCE_GT(

diff --git a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc
@@ -95,8 +95,8 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx,
       const int remain = d / axis_dim;
       for (int i = 0; i < n; ++i) {         // for each sample_1_dim
         for (int j = 0; j < remain; j++) {  // for each sample_other_dims
-          int idx = i * remain + j;  // this sample's label_idx. for 1d case,
-                                     // remain=1 and j=0, so, idx = i
+          auto idx = i * remain + j;  // this sample's label_idx. for 1d case,
+                                      // remain=1 and j=0, so, idx = i
           auto lbl = static_cast<int64_t>(label_data[idx]);  // NOLINT
           if (lbl == ignore_index) {
             for (int k = 0; k < axis_dim; ++k) {  // for each class id's label
@@ -147,8 +147,8 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx,
     const int remain = d / axis_dim;
     for (int i = 0; i < n; ++i) {         // for each sample_1_dim
       for (int j = 0; j < remain; j++) {  // for each sample_other_dims
-        int idx = i * remain + j;  // this sample's label_idx. for 1d case,
-                                   // remain=1 and j=0, so, idx = i
+        auto idx = i * remain + j;  // this sample's label_idx. for 1d case,
+                                    // remain=1 and j=0, so, idx = i
         auto lbl = static_cast<int64_t>(label_data[idx]);  // NOLINT
         if (lbl == ignore_index) {
           for (int k = 0; k < axis_dim; ++k) {  // for each class id's label

diff --git a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc
@@ -33,7 +33,7 @@ void DistributeFpnProposalsKernel(
     std::vector<DenseTensor*> multi_fpn_rois,
     std::vector<DenseTensor*> multi_level_rois_num,
     DenseTensor* restore_index) {
-  const int num_level = max_level - min_level + 1;
+  const auto num_level(max_level - min_level + 1);
 
   // check that the fpn_rois is not empty
   if (!rois_num.get_ptr()) {

diff --git a/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc b/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc
@@ -82,7 +82,7 @@ void LookupTableDequantKernel(const Context &dev_ctx,
               ids[i]));
       float min = *(table + ids[i] * quant_number);
       float max = *(table + ids[i] * quant_number + 1);
-      int offset = ids[i] * quant_number + 2;
+      auto offset = ids[i] * quant_number + 2;
       const unsigned char *tensor_buf =
           reinterpret_cast<const unsigned char *>(table + offset);
       dequant(

diff --git a/paddle/phi/kernels/cpu/lrn_kernel.cc b/paddle/phi/kernels/cpu/lrn_kernel.cc
@@ -91,7 +91,7 @@ struct LRNFunctor<phi::CPUContext, T> {
       }
       for (int c = 1; c < C; ++c) {
         // copy previous scale
-        int mid_offset = i * fea_size + c * img_size;
+        auto mid_offset = i * fea_size + c * img_size;
         std::memcpy(mdata + mid_offset,
                     mdata + mid_offset - img_size,
                     img_size * sizeof(T));

diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
@@ -42,7 +42,7 @@ void LapackSVD(const T* x_data,
   int mn = std::min(rows, cols);
   T* a = const_cast<T*>(x_data);  // NOLINT
   int lda = rows;
-  int lwork = 3 * mn + std::max(mx, 7 * mn);
+  auto lwork = 3 * mn + std::max(mx, 7 * mn);
   std::vector<phi::dtype::Real<T>> rwork(
       std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn));
   std::vector<T> work(lwork);

diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
@@ -79,12 +79,12 @@ void PsroiPoolGradKernel(const Context& dev_ctx,
       int pw = i % pooled_width;
       int ph = (i / pooled_width) % pooled_height;
       int c = (i / pooled_width / pooled_height) % output_channels;
-      int n = i / pooled_width / pooled_height / output_channels;
+      auto n = i / pooled_width / pooled_height / output_channels;
 
       // set roi_batch_id
       int roi_batch_id = rois_batch_id_data[n];
-      int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-      int input_offset =
+      auto input_channel = (c * pooled_height + ph) * pooled_width + pw;
+      auto input_offset =
           (roi_batch_id * input_channels + input_channel) * height * width;
       T* offset_dx_data = dx_data + input_offset;
 
@@ -124,7 +124,7 @@ void PsroiPoolGradKernel(const Context& dev_ctx,
       T diff_val = is_empty ? 0. : dout_data[i] / bin_area;
       for (int ih = hstart; ih < hend; ++ih) {
         for (int iw = wstart; iw < wend; ++iw) {
-          int input_index = ih * width + iw;
+          auto input_index = ih * width + iw;
           offset_dx_data[input_index] += diff_val;
         }
       }

diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc
@@ -148,7 +148,7 @@ void PsroiPoolKernel(const Context& dev_ctx,
           wend = std::min(std::max(wend, 0), width);
 
           int output_index = out_row_offset + pw;
-          int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+          auto input_channel = (c * pooled_height + ph) * pooled_width + pw;
           int input_plane_offset = static_cast<int>(
               roi_batch_id * in_stride[0] + input_channel * in_stride[1]);
           const T* offset_input_data = input_data + input_plane_offset;

diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h
@@ -99,7 +99,7 @@ void ResetParameterVector(const std::vector<TensorType>& raw_params_vec,
     for (int j = 0; j < layer_weight_size; j++) {
       int k = j % 4;
       const int& section = j / 4;
-      int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
+      auto tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
       if (k >= 2) {
         tensor_idx += bias_start_idx;
       }
@@ -217,8 +217,8 @@ void AllocateReserveData(const Context& dev_ctx,
   int direction_num = is_bidirec ? 2 : 1;
   int time_step = input->dims()[0];
   int batch_size = input->dims()[1];
-  int block_size = direction_num * time_step * batch_size * hidden_size;
-  int hidden_data_idx = (num_layers - 1);
+  auto block_size = direction_num * time_step * batch_size * hidden_size;
+  auto hidden_data_idx = (num_layers - 1);
   if (is_lstm(mode)) {
     hidden_data_idx += (gate_num + 2) * num_layers;
   } else if (is_gru(mode)) {

diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc
@@ -384,7 +384,7 @@ struct GradLayer {
       const std::string& mode) {
     int direction_num = is_bidirec ? 2 : 1;
     int current_reverse_idx = is_reverse ? 1 : 0;
-    int current_layer_idx = direction_num * layer_idx + current_reverse_idx;
+    auto current_layer_idx = direction_num * layer_idx + current_reverse_idx;
     int begin_idx = 0;
     if (is_reverse) {
       begin_idx = time_step;

diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc
@@ -175,7 +175,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
           out_grad_data + n * out_stride[0] + c * out_stride[1];
       for (int ph = 0; ph < pooled_height; ++ph) {
         for (int pw = 0; pw < pooled_width; ++pw) {
-          int pool_index = ph * pooled_width + pw;
+          auto pool_index = ph * pooled_width + pw;
           T out_grad_this_bin = batch_out_grad_data[pool_index];
           int roi_bin_grid_h = (sampling_ratio > 0)
                                    ? sampling_ratio

diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc
@@ -86,7 +86,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
       for (int c = 0; c < channels; ++c) {
         for (int ph = 0; ph < pooled_height; ++ph) {
           for (int pw = 0; pw < pooled_width; ++pw) {
-            int pool_index = ph * pooled_width + pw;
+            auto pool_index = ph * pooled_width + pw;
             if (arg_max_data[pool_index] >= 0) {
               auto index = arg_max_data[pool_index];
               batch_grad_data[index] += out_grad_data[pool_index];

diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc
@@ -135,7 +135,7 @@ void RoiPoolKernel(const Context& dev_ctx,
           wstart = std::min(std::max(wstart + box_start_w, 0), width);
           wend = std::min(std::max(wend + box_start_w, 0), width);
 
-          const int pool_index = ph * pooled_width + pw;
+          const auto pool_index(ph * pooled_width + pw);
 
           // Define an empty pooling region to be zero
           bool is_empty = (hend <= hstart) || (wend <= wstart);
@@ -145,7 +145,8 @@ void RoiPoolKernel(const Context& dev_ctx,
 
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
-              const int index = h * width + w;
+              const auto index(h * width + w);
+
               if (batch_data[index] > output_data[pool_index]) {
                 output_data[pool_index] = batch_data[index];
                 arg_max_data[pool_index] = index;

diff --git a/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc b/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc
@@ -46,7 +46,7 @@ struct SequenceExpandGradFunctor<phi::CPUContext, T> {
         if (x_seq_len == 0) continue;
         auto dx_sub = dx->Slice(x_start, x_end);
         dx_sub.Resize(common::flatten_to_1d(dx_sub.dims()));
-        int dout_end = dout_offset + repeat_num * x_seq_len;
+        auto dout_end = dout_offset + repeat_num * x_seq_len;
         auto dout_sub = dout.Slice(dout_offset, dout_end);
         dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
         phi::funcs::ColwiseSum<phi::CPUContext, T> col_sum;

diff --git a/paddle/phi/kernels/cpu/svd_kernel.cc b/paddle/phi/kernels/cpu/svd_kernel.cc
@@ -82,8 +82,8 @@ void BatchSvd(const T* X,
   // NOTE: this function is row major, because this function called the lapack.
   int stride = rows * cols;
   int k = std::min(rows, cols);
-  int stride_u = full ? rows * rows : k * rows;
-  int stride_v = full ? cols * cols : k * cols;
+  auto stride_u = full ? rows * rows : k * rows;
+  auto stride_v = full ? cols * cols : k * cols;
   for (int i = 0; i < batches; ++i) {
     LapackSvd<T>(X + i * stride,
                  U + i * stride_u,

diff --git a/paddle/phi/kernels/cpu/unpool_grad_kernel.cc b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc
@@ -113,8 +113,8 @@ void Unpool3dGrad(const Context& dev_ctx,
   const int output_depth = static_cast<int>(out.dims()[2]);
   const int output_height = static_cast<int>(out.dims()[3]);
   const int output_width = static_cast<int>(out.dims()[4]);
-  int input_feasize = input_depth * input_height * input_width;
-  int output_feasize = output_depth * output_height * output_width;
+  auto input_feasize = input_depth * input_height * input_width;
+  auto output_feasize = output_depth * output_height * output_width;
   const IndT* indices_data = indices.data<IndT>();
 
   for (int b = 0; b < batch_size; ++b) {

diff --git a/paddle/phi/kernels/cpu/unpool_kernel.cc b/paddle/phi/kernels/cpu/unpool_kernel.cc
@@ -107,8 +107,8 @@ void Unpool3d(const Context& dev_ctx,
   const int output_depth = static_cast<int>(out->dims()[2]);
   const int output_height = static_cast<int>(out->dims()[3]);
   const int output_width = static_cast<int>(out->dims()[4]);
-  int input_feasize = input_depth * input_height * input_width;
-  int output_feasize = output_depth * output_height * output_width;
+  auto input_feasize = input_depth * input_height * input_width;
+  auto output_feasize = output_depth * output_height * output_width;
   const T* input_data = x.data<T>();
   const IndT* indices_data = indices.data<IndT>();
   for (int b = 0; b < batch_size; ++b) {

diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc
@@ -168,7 +168,7 @@ void ViterbiDecodeKernel(const Context& dev_ctx,
   std::vector<DenseTensor> historys;
   // We create tensor buffer in order to avoid allocating memory frequently
   // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero...
-  int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
+  auto buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
   DenseTensor int_buffer = Empty<int64_t>(dev_ctx, {buffer_size});
   funcs::TensorBuffer int_tensor_buffer(int_buffer);
   // create float tensor buffer

diff --git a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc
@@ -282,7 +282,7 @@ void YoloLossKernel(const Context& dev_ctx,
           // If best IoU is bigger then ignore_thresh,
           // ignore the objectness loss.
           if (best_iou > ignore_thresh) {
-            int obj_idx = (i * mask_num + j) * stride + k * w + l;
+            auto obj_idx = (i * mask_num + j) * stride + k * w + l;
             obj_mask_data[obj_idx] = static_cast<T>(-1);
           }
           // all losses should be calculated if best IoU
@@ -339,7 +339,7 @@ void YoloLossKernel(const Context& dev_ctx,
                                stride,
                                score);
 
-        int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
+        auto obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
         obj_mask_data[obj_idx] = score;
 
         int label = gt_label_data[i * b + t];

diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h
@@ -98,7 +98,7 @@ static int GetVectorizedSize(const DenseTensor* tensor) {
     return 1;
   }
   constexpr int max_load_bits = 128;
-  int valid_vec_size = max_load_bits / CHAR_BIT / element_size;
+  auto valid_vec_size = max_load_bits / CHAR_BIT / element_size;
   uint64_t address = reinterpret_cast<uint64_t>(tensor->data());
 
   // Currently, decide to deal with no more than 4 data once while adopting

diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h
@@ -1620,13 +1620,13 @@ void Blas<phi::CPUContext>::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
     int sub_width = W2 / head_number;
 
     for (int i = 0; i < head_number; i++) {
-      int sub_matA_offset = (transA == CblasNoTrans)
-                                ? i * (W1 / head_number)
-                                : i * (W1 / head_number) * H1;
-      int sub_matB_offset = (transB == CblasNoTrans)
-                                ? i * (W2 / head_number)
-                                : i * (W2 / head_number) * H2;
-      int sub_matC_offset = i * W2 / head_number;
+      auto sub_matA_offset = (transA == CblasNoTrans)
+                                 ? i * (W1 / head_number)
+                                 : i * (W1 / head_number) * H1;
+      auto sub_matB_offset = (transB == CblasNoTrans)
+                                 ? i * (W2 / head_number)
+                                 : i * (W2 / head_number) * H2;
+      auto sub_matC_offset = i * W2 / head_number;
       for (int k = 0; k < batchCount; ++k) {
         a_array[k] = &A[k * strideA] + sub_matA_offset;
         b_array[k] = &B[k * strideB] + sub_matB_offset;
@@ -1665,12 +1665,12 @@ void Blas<phi::CPUContext>::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
     int sub_width = W1 / head_number;
 
     for (int i = 0; i < head_number; i++) {
-      int sub_matA_offset = (transA == CblasNoTrans)
-                                ? i * (W1 / head_number)
-                                : i * (W1 / head_number) * H1;
-      int sub_matB_offset = (transB == CblasNoTrans)
-                                ? i * (W1 / head_number) * W2
-                                : i * (W1 / head_number);
+      auto sub_matA_offset = (transA == CblasNoTrans)
+                                 ? i * (W1 / head_number)
+                                 : i * (W1 / head_number) * H1;
+      auto sub_matB_offset = (transB == CblasNoTrans)
+                                 ? i * (W1 / head_number) * W2
+                                 : i * (W1 / head_number);
       int sub_matC_offset = i * W2;
       for (int k = 0; k < batchCount; ++k) {
         a_array[k] = &A[k * strideA] + sub_matA_offset;

diff --git a/paddle/phi/kernels/funcs/block_radix_topk.cuh b/paddle/phi/kernels/funcs/block_radix_topk.cuh
@@ -65,7 +65,9 @@ class BlockRadixTopKGlobalMemory {
     assert(k < size && k > 0);
     int target_k = k;
     UnsignedBits key_pattern = 0;
-    int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS;
+    // digit_pos must stay signed: the loop below ends once it drops below 0,
+    // and a bare `auto` would deduce the unsigned type of the sizeof term.
+    auto digit_pos = static_cast<int>(sizeof(KeyT) * 8 - RADIX_BITS);
     for (; digit_pos >= 0; digit_pos -= RADIX_BITS) {
       UpdateSharedBins(data, size, digit_pos, key_pattern);
       InclusiveScanBins();
@@ -239,7 +241,8 @@ class BlockRadixTopKRegister {
 
 #pragma unroll
     for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      int idx = KEY * BLOCK_SIZE + tid_;
+      // KEY is unsigned; cast so idx keeps its original int type.
+      auto idx = static_cast<int>(KEY * BLOCK_SIZE + tid_);
       unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
       if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY];
       if (idx < valid_count) search_mask_ |= (1U << KEY);
@@ -248,7 +251,8 @@ class BlockRadixTopKRegister {
     int target_k = k;
     int prefix_k = 0;
 
-    for (int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0;
-         digit_pos -= RADIX_BITS) {
+    // Signed on purpose: an unsigned digit_pos would never fail `>= 0`.
+    for (auto digit_pos = static_cast<int>(sizeof(KeyT) * 8 - RADIX_BITS);
+         digit_pos >= 0; digit_pos -= RADIX_BITS) {
       UpdateSharedBins(unsigned_keys, digit_pos, prefix_k);
       InclusiveScanBins();

diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -445,7 +445,7 @@ void LaunchBroadcastKernel(
   const int blocks = 8;
   int read_lens = configs[0].buf_len;
   auto stream = dev_ctx.x_context()->xpu_stream;
-  int main_offset = (numel / (read_lens * threads)) * read_lens * threads;
+  auto main_offset = (numel / (read_lens * threads)) * read_lens * threads;
   int tail_tid = numel % (read_lens * threads);
 
   VectorizedBroadcastKernel<Functor, OutT, Arity, NumOuts, VecSize, false>
@@ -465,7 +465,7 @@ void LaunchBroadcastKernel(
   auto stream = dev_ctx.stream();
   auto threads = gpu_config.GetBlockSize();
   auto blocks = gpu_config.block_per_grid;
-  int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
+  auto main_offset = (numel / (VecSize * threads)) * VecSize * threads;
   int tail_tid = numel % (VecSize * threads);
 
   if (classifier.all_elementwise) {

diff --git a/paddle/phi/kernels/funcs/correlation_funcs.cu.h b/paddle/phi/kernels/funcs/correlation_funcs.cu.h
@@ -84,8 +84,8 @@ __global__ void channel_first(const T *input,
   int64_t global_idx = static_cast<int64_t>(blockIdx.x);
   int64_t stride = static_cast<int64_t>(gridDim.x);
 
-  int p_H = H + 2 * pad_size;
-  int p_W = W + 2 * pad_size;
+  auto p_H = H + 2 * pad_size;
+  auto p_W = W + 2 * pad_size;
   int64_t p_dimcw = channel * p_W;
   int64_t p_dimchw = channel * p_H * p_W;