From 830021b67ac412f01eefc6839df56ae306e46790 Mon Sep 17 00:00:00 2001
From: zrr1999 <2742392377@qq.com>
Date: Fri, 7 Nov 2025 08:59:53 +0000
Subject: [PATCH 1/2] use auto in intermediate expr

---
 .../cpu/add_position_encoding_kernel.cc | 5 +-
 .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/box_coder_kernel.cc | 4 +-
 .../cpu/broadcast_tensors_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/conv_util.h | 5 +-
 .../kernels/cpu/cross_entropy_grad_kernel.cc | 8 +-
 .../cpu/distribute_fpn_proposals_kernel.cc | 2 +-
 .../cpu/lookup_table_dequant_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/lrn_kernel.cc | 2 +-
 .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 2 +-
 .../phi/kernels/cpu/psroi_pool_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/psroi_pool_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/rnn_functor.h | 6 +-
 paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 2 +-
 .../phi/kernels/cpu/roi_align_grad_kernel.cc | 2 +-
 .../phi/kernels/cpu/roi_pool_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/roi_pool_kernel.cc | 5 +-
 .../cpu/sequence_expand_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/svd_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/unpool_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/unpool_kernel.cc | 4 +-
 .../phi/kernels/cpu/viterbi_decode_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/yolo_loss_kernel.cc | 4 +-
 paddle/phi/kernels/funcs/aligned_vector.h | 2 +-
 paddle/phi/kernels/funcs/blas/blas_impl.h | 26 +-
 paddle/phi/kernels/funcs/block_radix_topk.cuh | 6 +-
 paddle/phi/kernels/funcs/broadcast_function.h | 4 +-
 .../phi/kernels/funcs/correlation_funcs.cu.h | 4 +-
 paddle/phi/kernels/funcs/correlation_funcs.h | 4 +-
 .../kernels/funcs/deformable_conv_functor.cc | 5 +-
 .../phi/kernels/funcs/detail/gru_gpu_kernel.h | 4 +-
 .../kernels/funcs/detection/bbox_util.cu.h | 2 +-
 .../elementwise/elementwise_op_function.h | 14 +-
 paddle/phi/kernels/funcs/fc_functor.cu | 12 +-
 paddle/phi/kernels/funcs/im2col.cc | 25 +-
 paddle/phi/kernels/funcs/im2col.cu | 69 +--
 paddle/phi/kernels/funcs/im2col_cfo_cpu.h | 20 +-
 paddle/phi/kernels/funcs/index_put_utils.h | 2 +-
 paddle/phi/kernels/funcs/jit/gen/seqpool.cc | 3 +-
 paddle/phi/kernels/funcs/jit/gen_base.cc | 3 +-
 .../funcs/jit/more/intrinsic/crf_decoding.cc | 2 +-
 paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 19 +-
 paddle/phi/kernels/funcs/math/beam_search.cu | 7 +-
 .../phi/kernels/funcs/math/context_project.h | 8 +-
 paddle/phi/kernels/funcs/math/tree2col.cu | 3 +-
 paddle/phi/kernels/funcs/math/unpooling.cc | 8 +-
 paddle/phi/kernels/funcs/matrix_inverse.cu | 2 +-
 paddle/phi/kernels/funcs/matrix_solve.h | 4 +-
 paddle/phi/kernels/funcs/maxouting.cc | 2 +-
 .../kernels/funcs/multi_tensor_apply_util.h | 8 +-
 .../kernels/funcs/multihead_matmul_functor.cu | 6 +-
 paddle/phi/kernels/funcs/norm_utils.cu.h | 3 +-
 paddle/phi/kernels/funcs/pooling.cu | 4 +-
 paddle/phi/kernels/funcs/sequence_padding.cc | 8 +-
 paddle/phi/kernels/funcs/sparse/scatter.cu.h | 7 +-
 paddle/phi/kernels/funcs/stack_functor.h | 8 +-
 .../phi/kernels/funcs/sync_batch_norm_utils.h | 15 +-
 .../phi/kernels/funcs/top_k_function_cuda.h | 2 +-
 .../phi/kernels/funcs/transpose_function.cu.h | 17 +-
 paddle/phi/kernels/funcs/unsqueeze.h | 6 +-
 paddle/phi/kernels/funcs/vol2col.cc | 14 +-
 paddle/phi/kernels/funcs/vol2col.cu | 38 +-
 .../kernels/funcs/weight_dequant_functor.h | 12 +-
 paddle/phi/kernels/funcs/weight_only_gemv.cu | 10 +-
 .../cpu/fused_embedding_fc_lstm_kernel.cc | 3 +-
 .../kernels/fusion/cpu/fusion_gru_kernel.cc | 2 +-
 .../kernels/fusion/cpu/fusion_lstm_kernel.cc | 5 +-
 .../cpu/fusion_seqconv_eltadd_relu_kernel.cc | 2 +-
 .../fusion/cpu/self_dp_attention_kernel.cc | 10 +-
 .../fusion/cutlass/conv2d/conv2d_util.cu | 18 +-
 .../threadblock/epilogue_tensor_op_int32.h | 16 +-
 .../gemm/kernel/fpA_intB_gemm.h | 5 +-
 .../gemm/kernel/fpA_intB_gemm_split_k.h | 2 +-
 .../warp/mma_tensorop_compute_B_with_f16.h | 4 +-
 .../gemm/warp/mma_tensorop_dequantizer.h | 3 +-
 .../fpA_intB_gemm/fpA_intB_gemm_template.h | 9 +-
 .../epilogue/epilogue_pipelined.h | 8 +-
 .../gemm/attention_scaling_coefs_updater.h | 40 +-
 .../gemm/mma_accum_lambda_iterator.h | 38 +-
 .../gemm/mma_from_smem.h | 26 +-
 .../epilogue_predicated_tile_iterator.h | 51 +--
 ...cated_tile_access_iterator_residual_last.h | 2 +-
 .../predicated_tile_iterator_residual_last.h | 16 +-
 .../iterators/warp_iterator_from_smem.h | 2 +-
 .../kernel_backward.h | 10 +-
 paddle/phi/kernels/fusion/gpu/block_attn.h | 401 ++++++++++--------
 .../gpu/block_multi_head_attention_kernel.cu | 3 +-
 paddle/phi/kernels/fusion/gpu/fmha_ref.h | 12 +-
 .../gpu/fused_gate_attention_grad_kernel.cu | 12 +-
 .../fusion/gpu/fused_gate_attention_kernel.cu | 12 +-
 .../fused_layernorm_residual_dropout_bias.h | 10 +-
 .../gpu/fused_multi_transformer_kernel.cu | 6 +-
 .../gpu/fused_multi_transformer_op.cu.h | 212 +++++----
 .../gpu/fused_seqpool_cvm_grad_kernel.cu | 4 +-
 .../fusion/gpu/fused_seqpool_cvm_kernel.cu | 2 +-
 .../gpu/fused_softmax_mask_grad_kernel.cu | 6 +-
 .../fusion/gpu/fused_softmax_mask_kernel.cu | 19 +-
 ...softmax_mask_upper_triangle_grad_kernel.cu | 2 +-
 ...used_softmax_mask_upper_triangle_kernel.cu | 2 +-
 .../fused_weighted_swiglu_act_quant_kernel.cu | 2 +-
 .../gpu/masked_multihead_attention_kernel.cu | 70 +--
 .../fusion/gpu/multihead_matmul_kernel.cu | 22 +-
 .../fusion/gpu/qkv_unpack_mha_kernel.cu | 14 +-
 .../fusion/onednn/fusion_gru_kernel.cc | 6 +-
 .../kernels/fusion/onednn/fusion_rnn_onednn.h | 6 +-
 .../xpu/block_multi_head_attention_kernel.cc | 5 +-
 .../embedding_with_eltwise_add_xpu_kernel.cc | 4 +-
 .../kernels/gpu/affine_channel_grad_kernel.cu | 7 +-
 .../gpu/broadcast_tensors_grad_kernel.cu | 4 +-
 .../kernels/gpu/class_center_sample_kernel.cu | 2 +-
 .../kernels/gpu/correlation_grad_kernel.cu | 8 +-
 paddle/phi/kernels/gpu/correlation_kernel.cu | 6 +-
 .../kernels/gpu/cross_entropy_grad_kernel.cu | 6 +-
 .../phi/kernels/gpu/cross_entropy_kernel.cu | 4 +-
 paddle/phi/kernels/gpu/depthwise_conv.h | 227 +++++-----
 paddle/phi/kernels/gpu/determinant_kernel.cu | 2 +-
 .../gpu/distribute_fpn_proposals_kernel.cu | 2 +-
 .../phi/kernels/gpu/edit_distance_kernel.cu | 12 +-
 paddle/phi/kernels/gpu/elementwise_grad.h | 2 +-
 .../phi/kernels/gpu/flash_attn_v3_kernel.cu | 22 +-
 paddle/phi/kernels/gpu/flash_attn_v3_utils.cu | 4 +-
 .../kernels/gpu/generate_proposals_kernel.cu | 2 +-
 .../phi/kernels/gpu/global_gather_kernel.cu | 2 +-
 .../phi/kernels/gpu/global_scatter_kernel.cu | 2 +-
 .../phi/kernels/gpu/group_norm_grad_kernel.cu | 4 +-
 paddle/phi/kernels/gpu/group_norm_kernel.cu | 4 +-
 paddle/phi/kernels/gpu/instance_norm_utils.h | 3 +-
 paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/lrn_grad_kernel.cu | 10 +-
 paddle/phi/kernels/gpu/lrn_kernel.cu | 9 +-
 .../phi/kernels/gpu/multiclass_nms3_kernel.cu | 15 +-
 paddle/phi/kernels/gpu/multinomial_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/norm_kernel.cu | 3 +-
 paddle/phi/kernels/gpu/prior_box_kernel.cu | 8 +-
 .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 2 +-
 paddle/phi/kernels/gpu/psroi_pool_kernel.cu | 6 +-
 .../kernels/gpu/repeat_interleave_kernel.cu | 2 +-
.../phi/kernels/gpu/rms_norm_grad_kernel.cu | 13 +- paddle/phi/kernels/gpu/roll_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel.cu | 2 +- .../phi/kernels/gpu/row_conv_grad_kernel.cu | 4 +- .../gpu/sequence_expand_grad_kernel.cu | 2 +- .../phi/kernels/gpu/sequence_expand_kernel.cu | 2 +- paddle/phi/kernels/gpu/shuffle_channel.h | 6 +- .../gpu/shuffle_channel_grad_kernel.cu | 2 +- .../phi/kernels/gpu/shuffle_channel_kernel.cu | 2 +- .../phi/kernels/gpu/slogdeterminant_kernel.cu | 2 +- .../phi/kernels/gpu/top_p_sampling_kernel.cu | 2 +- paddle/phi/kernels/gpu/tril_indices_kernel.cu | 2 +- paddle/phi/kernels/gpu/unpool_grad_kernel.cu | 4 +- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- .../gpu/weighted_sample_neighbors_kernel.cu | 10 +- .../phi/kernels/gpu/yolo_box_head_kernel.cu | 5 +- .../phi/kernels/gpu/yolo_box_post_kernel.cu | 10 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 8 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 4 +- .../gpudnn/conv_transpose_grad_kernel.cu | 8 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 4 +- .../impl/anchor_generator_kernel_impl.h | 2 +- .../impl/broadcast_tensors_kernel_impl.h | 4 +- .../kernels/impl/cholesky_grad_kernel_impl.h | 27 +- .../impl/collect_fpn_proposals_kernel_impl.h | 7 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/fold_grad_kernel_impl.h | 14 +- paddle/phi/kernels/impl/fold_kernel_impl.h | 14 +- .../kernels/impl/im2sequence_kernel_impl.h | 5 +- .../impl/llm_int8_matmul_kernel_impl.h | 18 +- .../kernels/impl/matrix_power_kernel_impl.h | 3 +- .../kernels/impl/unstack_grad_kernel_impl.h | 2 +- .../impl/weight_quantize_kernel_gpu_impl.h | 4 +- .../impl/weight_quantize_kernel_impl.h | 10 +- paddle/phi/kernels/onednn/concat_kernel.cc | 2 +- .../phi/kernels/onednn/matmul_grad_kernel.cc | 4 +- paddle/phi/kernels/onednn/multi_gru_kernel.cc | 12 +- .../phi/kernels/onednn/reduce_kernel_impl.h | 2 +- .../kernels/primitive/compute_primitives.h | 4 +- .../primitive/datamover_primitives_xpu2.h | 10 +- .../stride/reduce_grad_stride_kernel.cu | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 2 +- 180 files changed, 1217 insertions(+), 1024 deletions(-) mode change 100755 => 100644 paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc diff --git a/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc b/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc index 4b1dbee20c6aec..c023ea2a82345d 100644 --- a/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc +++ b/paddle/phi/kernels/cpu/add_position_encoding_kernel.cc @@ -76,8 +76,9 @@ void AddPositionEncodingKernel(const Context& dev_ctx, const int half_size = enc_size / 2; for (int i = 0; i < batch_size; ++i) { - const int max_length = - x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]; + const auto max_length(x_lod.empty() ? max_seq_len + : x_lod[0][i + 1] - x_lod[0][i]); + for (int j = 0; j < max_length; ++j) { for (int k = 0; k < half_size; ++k) { const double val = diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index ecc3cc4df61b13..1d00c63ab76599 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -181,7 +181,7 @@ void BatchNormGradFunctor(const Context& dev_ctx, bias_arr.setZero(); } - int scale_coeff = use_global_stats ? 1 : N * sample_size; + auto scale_coeff = use_global_stats ? 
1 : N * sample_size; const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coeff; DenseTensor dy_sum; diff --git a/paddle/phi/kernels/cpu/box_coder_kernel.cc b/paddle/phi/kernels/cpu/box_coder_kernel.cc index 6a0998a3bfd088..cdb60de18e9642 100644 --- a/paddle/phi/kernels/cpu/box_coder_kernel.cc +++ b/paddle/phi/kernels/cpu/box_coder_kernel.cc @@ -120,7 +120,7 @@ void DecodeCenterSize(const DenseTensor *target_box, std::array var_data{1., 1., 1., 1.}; T *var_ptr = var_data.data(); size_t offset = i * col * len + j * len; - int prior_box_offset = axis == 0 ? j * len : i * len; + auto prior_box_offset = axis == 0 ? j * len : i * len; T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + @@ -135,7 +135,7 @@ void DecodeCenterSize(const DenseTensor *target_box, T target_box_center_x = 0, target_box_center_y = 0; T target_box_width = 0, target_box_height = 0; - int prior_var_offset = axis == 0 ? j * len : i * len; + auto prior_var_offset = axis == 0 ? j * len : i * len; if (var_size == 2) { std::memcpy(var_ptr, prior_box_var->data() + prior_var_offset, diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 40964b6b447c42..31880d0160094d 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -112,8 +112,8 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx, std::vector reduce_dims_vec; std::vector reshape_dims_vec; for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; + auto out_axis = out_rank - j - 1; + auto in_axis = in_rank - j - 1; reshape_dims_vec.push_back(static_cast(input_dims[j])); if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index af17fb06c6ec90..df497513d04dec 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -77,8 +77,9 @@ inline int ConvOutSize(int input_size, int pad_left, int pad_right, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = + const auto dkernel(dilation * (filter_size - 1) + 1); + + auto output_size = (input_size + (pad_left + pad_right) - dkernel) / stride + 1; PADDLE_ENFORCE_GT( diff --git a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc index f9b3daee2571a4..b2880a1ce33b9f 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc @@ -95,8 +95,8 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx, const int remain = d / axis_dim; for (int i = 0; i < n; ++i) { // for each sample_1_dim for (int j = 0; j < remain; j++) { // for each sample_other_dims - int idx = i * remain + j; // this sample's label_idx. for 1d case, - // remain=1 and j=0, so, idx = i + auto idx = i * remain + j; // this sample's label_idx. 
for 1d case, + // remain=1 and j=0, so, idx = i auto lbl = static_cast(label_data[idx]); // NOLINT if (lbl == ignore_index) { for (int k = 0; k < axis_dim; ++k) { // for each class id's label @@ -147,8 +147,8 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx, const int remain = d / axis_dim; for (int i = 0; i < n; ++i) { // for each sample_1_dim for (int j = 0; j < remain; j++) { // for each sample_other_dims - int idx = i * remain + j; // this sample's label_idx. for 1d case, - // remain=1 and j=0, so, idx = i + auto idx = i * remain + j; // this sample's label_idx. for 1d case, + // remain=1 and j=0, so, idx = i auto lbl = static_cast(label_data[idx]); // NOLINT if (lbl == ignore_index) { for (int k = 0; k < axis_dim; ++k) { // for each class id's label diff --git a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc index c1c13e1539bdb9..8d83872f1768b5 100644 --- a/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc +++ b/paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc @@ -33,7 +33,7 @@ void DistributeFpnProposalsKernel( std::vector multi_fpn_rois, std::vector multi_level_rois_num, DenseTensor* restore_index) { - const int num_level = max_level - min_level + 1; + const auto num_level(max_level - min_level + 1); // check that the fpn_rois is not empty if (!rois_num.get_ptr()) { diff --git a/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc b/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc index 03f1ecaf162ee1..48d48a6ae4736a 100644 --- a/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc +++ b/paddle/phi/kernels/cpu/lookup_table_dequant_kernel.cc @@ -82,7 +82,7 @@ void LookupTableDequantKernel(const Context &dev_ctx, ids[i])); float min = *(table + ids[i] * quant_number); float max = *(table + ids[i] * quant_number + 1); - int offset = ids[i] * quant_number + 2; + auto offset = ids[i] * quant_number + 2; const unsigned char *tensor_buf = reinterpret_cast(table + offset); dequant( diff --git a/paddle/phi/kernels/cpu/lrn_kernel.cc b/paddle/phi/kernels/cpu/lrn_kernel.cc index d4dfcacdd6a2ca..6efe58243a447d 100644 --- a/paddle/phi/kernels/cpu/lrn_kernel.cc +++ b/paddle/phi/kernels/cpu/lrn_kernel.cc @@ -91,7 +91,7 @@ struct LRNFunctor { } for (int c = 1; c < C; ++c) { // copy previous scale - int mid_offset = i * fea_size + c * img_size; + auto mid_offset = i * fea_size + c * img_size; std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size, img_size * sizeof(T)); diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 56c2459f61e43b..0f08c0a1c7a3c4 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -42,7 +42,7 @@ void LapackSVD(const T* x_data, int mn = std::min(rows, cols); T* a = const_cast(x_data); // NOLINT int lda = rows; - int lwork = 3 * mn + std::max(mx, 7 * mn); + auto lwork = 3 * mn + std::max(mx, 7 * mn); std::vector> rwork( std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn)); std::vector work(lwork); diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index 4f9cc16890ea79..5959dda73eaaca 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -79,12 +79,12 @@ void PsroiPoolGradKernel(const Context& dev_ctx, int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; int c = (i / 
pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; + auto n = i / pooled_width / pooled_height / output_channels; // set roi_batch_id int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = + auto input_channel = (c * pooled_height + ph) * pooled_width + pw; + auto input_offset = (roi_batch_id * input_channels + input_channel) * height * width; T* offset_dx_data = dx_data + input_offset; @@ -124,7 +124,7 @@ void PsroiPoolGradKernel(const Context& dev_ctx, T diff_val = is_empty ? 0. : dout_data[i] / bin_area; for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; + auto input_index = ih * width + iw; offset_dx_data[input_index] += diff_val; } } diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index db16aa3a541cd0..56241730d39ce1 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -148,7 +148,7 @@ void PsroiPoolKernel(const Context& dev_ctx, wend = std::min(std::max(wend, 0), width); int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; + auto input_channel = (c * pooled_height + ph) * pooled_width + pw; int input_plane_offset = static_cast( roi_batch_id * in_stride[0] + input_channel * in_stride[1]); const T* offset_input_data = input_data + input_plane_offset; diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index d7c1df8a0bb615..538ab4f125f1fa 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -99,7 +99,7 @@ void ResetParameterVector(const std::vector& raw_params_vec, for (int j = 0; j < layer_weight_size; j++) { int k = j % 4; const int& section = j / 4; - int tensor_idx = i * 2 * direction_num + section * 2 + k % 2; + auto tensor_idx = i * 2 * direction_num + section * 2 + k % 2; if (k >= 2) { tensor_idx += bias_start_idx; } @@ -217,8 +217,8 @@ void AllocateReserveData(const Context& dev_ctx, int direction_num = is_bidirec ? 2 : 1; int time_step = input->dims()[0]; int batch_size = input->dims()[1]; - int block_size = direction_num * time_step * batch_size * hidden_size; - int hidden_data_idx = (num_layers - 1); + auto block_size = direction_num * time_step * batch_size * hidden_size; + auto hidden_data_idx = (num_layers - 1); if (is_lstm(mode)) { hidden_data_idx += (gate_num + 2) * num_layers; } else if (is_gru(mode)) { diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index de355c643b1d9a..5d25d77ebd68aa 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -384,7 +384,7 @@ struct GradLayer { const std::string& mode) { int direction_num = is_bidirec ? 2 : 1; int current_reverse_idx = is_reverse ? 
1 : 0; - int current_layer_idx = direction_num * layer_idx + current_reverse_idx; + auto current_layer_idx = direction_num * layer_idx + current_reverse_idx; int begin_idx = 0; if (is_reverse) { begin_idx = time_step; diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc index 00bf2968b0fd5c..4b60feb03ad2f2 100644 --- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -175,7 +175,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, out_grad_data + n * out_stride[0] + c * out_stride[1]; for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; + auto pool_index = ph * pooled_width + pw; T out_grad_this_bin = batch_out_grad_data[pool_index]; int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc index 465412b40074a9..5c5405d5539308 100644 --- a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -86,7 +86,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, for (int c = 0; c < channels; ++c) { for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; + auto pool_index = ph * pooled_width + pw; if (arg_max_data[pool_index] >= 0) { auto index = arg_max_data[pool_index]; batch_grad_data[index] += out_grad_data[pool_index]; diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc index bdef9c8ec6e840..299a4566fcdd63 100644 --- a/paddle/phi/kernels/cpu/roi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -135,7 +135,7 @@ void RoiPoolKernel(const Context& dev_ctx, wstart = std::min(std::max(wstart + box_start_w, 0), width); wend = std::min(std::max(wend + box_start_w, 0), width); - const int pool_index = ph * pooled_width + pw; + const auto pool_index(ph * pooled_width + pw); // Define an empty pooling region to be zero bool is_empty = (hend <= hstart) || (wend <= wstart); @@ -145,7 +145,8 @@ void RoiPoolKernel(const Context& dev_ctx, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; + const auto index(h * width + w); + if (batch_data[index] > output_data[pool_index]) { output_data[pool_index] = batch_data[index]; arg_max_data[pool_index] = index; diff --git a/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc b/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc index c1d3356935accd..16fc4232870be9 100644 --- a/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/sequence_expand_grad_kernel.cc @@ -46,7 +46,7 @@ struct SequenceExpandGradFunctor { if (x_seq_len == 0) continue; auto dx_sub = dx->Slice(x_start, x_end); dx_sub.Resize(common::flatten_to_1d(dx_sub.dims())); - int dout_end = dout_offset + repeat_num * x_seq_len; + auto dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); phi::funcs::ColwiseSum col_sum; diff --git a/paddle/phi/kernels/cpu/svd_kernel.cc b/paddle/phi/kernels/cpu/svd_kernel.cc index a88e8c98854d9a..0fe409883b3c5b 100644 --- a/paddle/phi/kernels/cpu/svd_kernel.cc +++ b/paddle/phi/kernels/cpu/svd_kernel.cc @@ -82,8 +82,8 @@ void BatchSvd(const T* X, // NOTE: this function is row major, because this function called 
the lapack. int stride = rows * cols; int k = std::min(rows, cols); - int stride_u = full ? rows * rows : k * rows; - int stride_v = full ? cols * cols : k * cols; + auto stride_u = full ? rows * rows : k * rows; + auto stride_v = full ? cols * cols : k * cols; for (int i = 0; i < batches; ++i) { LapackSvd(X + i * stride, U + i * stride_u, diff --git a/paddle/phi/kernels/cpu/unpool_grad_kernel.cc b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc index afb2dfdcb095c9..960b2da133df9f 100644 --- a/paddle/phi/kernels/cpu/unpool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc @@ -113,8 +113,8 @@ void Unpool3dGrad(const Context& dev_ctx, const int output_depth = static_cast(out.dims()[2]); const int output_height = static_cast(out.dims()[3]); const int output_width = static_cast(out.dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const IndT* indices_data = indices.data(); for (int b = 0; b < batch_size; ++b) { diff --git a/paddle/phi/kernels/cpu/unpool_kernel.cc b/paddle/phi/kernels/cpu/unpool_kernel.cc index 965698dd8cdd08..e9048e43389985 100644 --- a/paddle/phi/kernels/cpu/unpool_kernel.cc +++ b/paddle/phi/kernels/cpu/unpool_kernel.cc @@ -107,8 +107,8 @@ void Unpool3d(const Context& dev_ctx, const int output_depth = static_cast(out->dims()[2]); const int output_height = static_cast(out->dims()[3]); const int output_width = static_cast(out->dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const T* input_data = x.data(); const IndT* indices_data = indices.data(); for (int b = 0; b < batch_size; ++b) { diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc index fad1b2ec2b2663..c215b6af5d596e 100644 --- a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc +++ b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc @@ -168,7 +168,7 @@ void ViterbiDecodeKernel(const Context& dev_ctx, std::vector historys; // We create tensor buffer in order to avoid allocating memory frequently // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... - int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; + auto buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; DenseTensor int_buffer = Empty(dev_ctx, {buffer_size}); funcs::TensorBuffer int_tensor_buffer(int_buffer); // create float tensor buffer diff --git a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc index 96c38a7f1560d0..ee00050b57575d 100644 --- a/paddle/phi/kernels/cpu/yolo_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc @@ -282,7 +282,7 @@ void YoloLossKernel(const Context& dev_ctx, // If best IoU is bigger then ignore_thresh, // ignore the objectness loss. 
if (best_iou > ignore_thresh) { - int obj_idx = (i * mask_num + j) * stride + k * w + l; + auto obj_idx = (i * mask_num + j) * stride + k * w + l; obj_mask_data[obj_idx] = static_cast(-1); } // all losses should be calculated if best IoU @@ -339,7 +339,7 @@ void YoloLossKernel(const Context& dev_ctx, stride, score); - int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; + auto obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; obj_mask_data[obj_idx] = score; int label = gt_label_data[i * b + t]; diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 05733300c9a23c..64958823f053a9 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -98,7 +98,7 @@ static int GetVectorizedSize(const DenseTensor* tensor) { return 1; } constexpr int max_load_bits = 128; - int valid_vec_size = max_load_bits / CHAR_BIT / element_size; + auto valid_vec_size = max_load_bits / CHAR_BIT / element_size; uint64_t address = reinterpret_cast(tensor->data()); // Currently, decide to deal with no more than 4 data once while adopting diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 2c5b59ba4b8f6a..4a6376e372ce11 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1620,13 +1620,13 @@ void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, int sub_width = W2 / head_number; for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; + auto sub_matA_offset = (transA == CblasNoTrans) + ? i * (W1 / head_number) + : i * (W1 / head_number) * H1; + auto sub_matB_offset = (transB == CblasNoTrans) + ? i * (W2 / head_number) + : i * (W2 / head_number) * H2; + auto sub_matC_offset = i * W2 / head_number; for (int k = 0; k < batchCount; ++k) { a_array[k] = &A[k * strideA] + sub_matA_offset; b_array[k] = &B[k * strideB] + sub_matB_offset; @@ -1665,12 +1665,12 @@ void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, int sub_width = W1 / head_number; for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W1 / head_number) * W2 - : i * (W1 / head_number); + auto sub_matA_offset = (transA == CblasNoTrans) + ? i * (W1 / head_number) + : i * (W1 / head_number) * H1; + auto sub_matB_offset = (transB == CblasNoTrans) + ? 
i * (W1 / head_number) * W2 + : i * (W1 / head_number); int sub_matC_offset = i * W2; for (int k = 0; k < batchCount; ++k) { a_array[k] = &A[k * strideA] + sub_matA_offset; diff --git a/paddle/phi/kernels/funcs/block_radix_topk.cuh b/paddle/phi/kernels/funcs/block_radix_topk.cuh index 6958bbe834721f..06870d4bcfce77 100644 --- a/paddle/phi/kernels/funcs/block_radix_topk.cuh +++ b/paddle/phi/kernels/funcs/block_radix_topk.cuh @@ -65,7 +65,7 @@ class BlockRadixTopKGlobalMemory { assert(k < size && k > 0); int target_k = k; UnsignedBits key_pattern = 0; - int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; + auto digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; for (; digit_pos >= 0; digit_pos -= RADIX_BITS) { UpdateSharedBins(data, size, digit_pos, key_pattern); InclusiveScanBins(); @@ -239,7 +239,7 @@ class BlockRadixTopKRegister { #pragma unroll for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - int idx = KEY * BLOCK_SIZE + tid_; + auto idx = KEY * BLOCK_SIZE + tid_; unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY]; if (idx < valid_count) search_mask_ |= (1U << KEY); @@ -248,7 +248,7 @@ class BlockRadixTopKRegister { int target_k = k; int prefix_k = 0; - for (int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; + for (auto digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) { UpdateSharedBins(unsigned_keys, digit_pos, prefix_k); InclusiveScanBins(); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 167be9f2e0d74e..85505581514c1b 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -445,7 +445,7 @@ void LaunchBroadcastKernel( const int blocks = 8; int read_lens = configs[0].buf_len; auto stream = dev_ctx.x_context()->xpu_stream; - int main_offset = (numel / (read_lens * threads)) * read_lens * threads; + auto main_offset = (numel / (read_lens * threads)) * read_lens * threads; int tail_tid = numel % (read_lens * threads); VectorizedBroadcastKernel @@ -465,7 +465,7 @@ void LaunchBroadcastKernel( auto stream = dev_ctx.stream(); auto threads = gpu_config.GetBlockSize(); auto blocks = gpu_config.block_per_grid; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + auto main_offset = (numel / (VecSize * threads)) * VecSize * threads; int tail_tid = numel % (VecSize * threads); if (classifier.all_elementwise) { diff --git a/paddle/phi/kernels/funcs/correlation_funcs.cu.h b/paddle/phi/kernels/funcs/correlation_funcs.cu.h index db121f7119e702..446688003cbda1 100644 --- a/paddle/phi/kernels/funcs/correlation_funcs.cu.h +++ b/paddle/phi/kernels/funcs/correlation_funcs.cu.h @@ -84,8 +84,8 @@ __global__ void channel_first(const T *input, int64_t global_idx = static_cast(blockIdx.x); int64_t stride = static_cast(gridDim.x); - int p_H = H + 2 * pad_size; - int p_W = W + 2 * pad_size; + auto p_H = H + 2 * pad_size; + auto p_W = W + 2 * pad_size; int64_t p_dimcw = channel * p_W; int64_t p_dimchw = channel * p_H * p_W; diff --git a/paddle/phi/kernels/funcs/correlation_funcs.h b/paddle/phi/kernels/funcs/correlation_funcs.h index 6f2ddc6ab2da3c..745d256233c050 100644 --- a/paddle/phi/kernels/funcs/correlation_funcs.h +++ b/paddle/phi/kernels/funcs/correlation_funcs.h @@ -30,8 +30,8 @@ inline std::vector CorrelationOutputSize(int batch, std::vector output_shape({batch}); int kernel_radius = (kernel_size - 1) / 2; int border_radius = kernel_radius + 
max_displacement; - int padded_input_height = input_height + 2 * pad_size; - int padded_input_width = input_width + 2 * pad_size; + auto padded_input_height = input_height + 2 * pad_size; + auto padded_input_width = input_width + 2 * pad_size; int output_channel = ((max_displacement / stride2) * 2 + 1) * ((max_displacement / stride2) * 2 + 1); output_shape.push_back(output_channel); diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 879c3b3a1ddc9d..620729ad06356e 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -86,8 +86,9 @@ inline void ModulatedDeformableIm2colCPUKernel( } *data_col_ptr = val; if (data_mask_ptr) { - const int data_mask_hw_ptr = - ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const auto data_mask_hw_ptr( + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col); + const T mask = data_mask_ptr[data_mask_hw_ptr]; *data_col_ptr *= mask; } diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index b491cbe120d06f..95c671686d4745 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -129,7 +129,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, T b0[Tiled_size]; int COL = blockIdx.x * blockDim.x + threadIdx.x; - int Tiled_mask = ((1 << Tiled_size) - 1); + auto Tiled_mask = ((1 << Tiled_size) - 1); // Tiled matrix multiply using register shift, faster than sm. if (prev_output_value) { for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) { @@ -191,7 +191,7 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, T b0[Tiled_size]; T c0 = 0.0f; - int Tiled_mask = ((1 << Tiled_size) - 1); + auto Tiled_mask = ((1 << Tiled_size) - 1); //- Tiled matrix multiply with register shift if (prev_out_value) { for (int k = 0; k < (((frame_size - 1) / Tiled_size) + 1); ++k) { diff --git a/paddle/phi/kernels/funcs/detection/bbox_util.cu.h b/paddle/phi/kernels/funcs/detection/bbox_util.cu.h index f60b6d2e584794..b7ae6124a844f4 100644 --- a/paddle/phi/kernels/funcs/detection/bbox_util.cu.h +++ b/paddle/phi/kernels/funcs/detection/bbox_util.cu.h @@ -214,7 +214,7 @@ static __global__ void FilterBBoxes(const T *bboxes, } __syncthreads(); if (threadIdx.x == 0) { - int size = (num - i) < BlockSize ? num - i : BlockSize; + auto size = (num - i) < BlockSize ? num - i : BlockSize; for (int j = 0; j < size; ++j) { if (keep_index[j] > -1) { keep[cnt++] = keep_index[j]; diff --git a/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h b/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h index 7859f39aaa48e3..16741164b30783 100644 --- a/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h +++ b/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h @@ -123,7 +123,7 @@ static void FusedElemwiseAndActBroadcast1CPU(const T *x, T *intermediate_out) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int offset = i * w + j; + auto offset = i * w + j; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? x[offset] : x[j]; @@ -171,7 +171,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < post; ++k) { - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? 
x[offset] : x[j]; @@ -219,7 +219,7 @@ static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel( int j = threadIdx.x; while (j < w) { - int offset = i * w + j; + auto offset = i * w + j; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? x[offset] : x[j]; @@ -295,7 +295,7 @@ static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel( int k = tid % post; if (i >= pre) break; - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; T y_val = BcastY ? y[j] : y[offset]; T x_val = BcastY ? x[offset] : x[j]; @@ -596,7 +596,7 @@ static void FusedElemwiseAndActGradBroadcast1CPU( T zero = static_cast(0); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int offset = i * w + j; + auto offset = i * w + j; tmp_out_idx = BcastY ? j : offset; y_idx = BcastY ? j : offset; @@ -694,7 +694,7 @@ static void FusedElemwiseAndActGradBroadcast2CPU( for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < post; ++k) { - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; tmp_out_idx = BcastY ? j : offset; y_idx = BcastY ? j : offset; @@ -988,7 +988,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( int k = ttid % post; if (i >= pre) break; - int offset = i * n * post + j * post + k; + auto offset = i * n * post + j * post + k; tmp_out_idx = BcastY ? j : offset; y_idx = BcastY ? j : offset; diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee328a75..b83de76474ce50 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -107,7 +107,8 @@ void AddReluKernel( gpuStream_t stream, const int M, const int N, T* Y, const T* B, bool relu) { if (N % 4 == 0) { const int threads = 256; - const int num = M * N / 4; + const auto num(M * N / 4); + const int blocks = (num + threads - 1) / threads; typedef typename FcTypeTraits::Type trans_type; auto* bias_ptr_v4 = reinterpret_cast(B); @@ -223,8 +224,10 @@ void LaunchBiasAddReluHalf2Kernel(cudaStream_t stream, const float16* B, bool relu) { const int threads = 256; - const int vec_num = rows * cols / (Half2VecSize * 2); - const int half2_num = rows * cols / 2; + const auto vec_num(rows * cols / (Half2VecSize * 2)); + + const auto half2_num(rows * cols / 2); + const int blocks = (vec_num + threads - 1) / threads; // Here reinterpret_cast to half2 type. 
typedef typename FcTypeTraits::Type trans_type; @@ -308,7 +311,8 @@ void AddReluKernel(gpuStream_t stream, bool relu) { if (N % 4 == 0) { const int threads = 256; - const int num = M * N / 4; + const auto num(M * N / 4); + const int blocks = (num + threads - 1) / threads; typedef typename FcTypeTraits::Type trans_type; auto* bias_ptr_v4 = reinterpret_cast(B); diff --git a/paddle/phi/kernels/funcs/im2col.cc b/paddle/phi/kernels/funcs/im2col.cc index a6478f01c19422..4ccad15f0975e1 100644 --- a/paddle/phi/kernels/funcs/im2col.cc +++ b/paddle/phi/kernels/funcs/im2col.cc @@ -121,7 +121,7 @@ class Col2ImFunctor { common::errors::InvalidArgument("Output_height and padding(padding_up, " "padding_down) are inconsistent.")); - int channels_col = im_channels * filter_height * filter_width; + auto channels_col = im_channels * filter_height * filter_width; T* im_data = im->data(); const T* col_data = col.data(); @@ -131,9 +131,9 @@ class Col2ImFunctor { int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + auto im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < col_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + auto im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; if ((im_row_idx) >= 0 && (im_row_idx) < im_height && (im_col_idx) >= 0 && (im_col_idx) < im_width) { int im_offset = 0; @@ -215,14 +215,14 @@ class Im2ColFunctor { for (int channel = 0; channel < im_channels; ++channel) { for (int filter_row_idx = 0; filter_row_idx < filter_height; ++filter_row_idx) { - int im_row_offset = + auto im_row_offset = col_row_idx * stride[0] + filter_row_idx - padding[0]; for (int filter_col_idx = 0; filter_col_idx < filter_width; ++filter_col_idx) { - int im_col_offset = + auto im_col_offset = col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = + auto col_offset = ((((col_row_idx)*col_width + col_col_idx) * im_channels + channel) * filter_height + @@ -230,8 +230,9 @@ class Im2ColFunctor { filter_width + filter_col_idx; - int im_offset = (channel * im_height + im_row_offset) * im_width + - im_col_offset; + auto im_offset = + (channel * im_height + im_row_offset) * im_width + + im_col_offset; col_data[col_offset] = (im_row_offset < 0 || im_row_offset >= im_height || im_col_offset < 0 || im_col_offset >= im_width) @@ -300,14 +301,14 @@ class Col2ImFunctor { for (int channel = 0; channel < im_channels; ++channel) { for (int filter_row_idx = 0; filter_row_idx < filter_height; ++filter_row_idx) { - int im_row_offset = + auto im_row_offset = col_row_idx * stride[0] + filter_row_idx - padding[0]; for (int filter_col_idx = 0; filter_col_idx < filter_width; ++filter_col_idx) { - int im_col_offset = + auto im_col_offset = col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = + auto col_offset = (((col_row_idx * col_width + col_col_idx) * im_channels + channel) * filter_height + @@ -317,7 +318,7 @@ class Col2ImFunctor { if (im_row_offset >= 0 && im_row_offset < im_height && im_col_offset >= 0 && im_col_offset < im_width) { - int im_offset = + auto im_offset = (channel * im_height + im_row_offset) * im_width + im_col_offset; im_data[im_offset] += col_data[col_offset]; diff --git a/paddle/phi/kernels/funcs/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu index cea94f97453d04..74f8037a2f393a 100644 --- a/paddle/phi/kernels/funcs/im2col.cu +++ 
b/paddle/phi/kernels/funcs/im2col.cu @@ -41,8 +41,8 @@ __global__ void im2col(const T* data_im, int col_width, T* data_col, const DataLayout data_layout) { - int input_channels = num_outs / col_height / col_width; - int channels_col = input_channels * filter_height * filter_width; + auto input_channels = num_outs / col_height / col_width; + auto channels_col = input_channels * filter_height * filter_width; const int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; if (index < num_outs) { @@ -55,15 +55,15 @@ __global__ void im2col(const T* data_im, int channel_in = (data_layout != DataLayout::kNHWC ? index / col_width / col_height : index % input_channels); - int channel_out = channel_in * filter_height * filter_width; - int h_in = h_out * stride_height - padding_height; - int w_in = w_out * stride_width - padding_width; + auto channel_out = channel_in * filter_height * filter_width; + auto h_in = h_out * stride_height - padding_height; + auto w_in = w_out * stride_width - padding_width; data_col += (channel_out * col_height + h_out) * col_width + w_out; for (int i = 0; i < filter_height; ++i) { for (int j = 0; j < filter_width; ++j) { - int rIdx = h_in + i * dilation_h; - int cIdx = w_in + j * dilation_w; + auto rIdx = h_in + i * dilation_h; + auto cIdx = w_in + j * dilation_w; int im_idx; if (data_layout != DataLayout::kNHWC) { im_idx = (channel_in * im_height + rIdx) * im_width + cIdx; @@ -126,7 +126,7 @@ class Im2ColFunctor { int col_height = col->dims()[3]; int col_width = col->dims()[4]; - int num_outputs = im_channels * col_height * col_width; + auto num_outputs = im_channels * col_height * col_width; int num_thread = 1024; #ifdef WITH_NV_JETSON phi::backends::gpu::ChangeThreadNum(dev_ctx, &num_thread); @@ -175,10 +175,11 @@ __global__ void col2im(int n, const int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - const int d_filter_height = dilation_h * (filter_height - 1) + 1; - const int d_filter_width = dilation_w * (filter_width - 1) + 1; + const auto d_filter_height(dilation_h * (filter_height - 1) + 1); - int input_channels = n / im_height / im_width; + const auto d_filter_width(dilation_w * (filter_width - 1) + 1); + + auto input_channels = n / im_height / im_width; if (index < n) { T val = static_cast(0); @@ -193,21 +194,21 @@ __global__ void col2im(int n, : index % input_channels); // compute the start and end of the output - int w_col_start = + auto w_col_start = (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; int w_col_end = min(w / stride_width + 1, col_width); - int h_col_start = + auto h_col_start = (h < d_filter_height) ? 
0 : (h - d_filter_height) / stride_height + 1; int h_col_end = min(h / stride_height + 1, col_height); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - int h_off = (h - h_col * stride_height); - int w_off = (w - w_col * stride_width); + auto h_off = (h - h_col * stride_height); + auto w_off = (w - w_col * stride_width); if (h_off % dilation_h == 0 && w_off % dilation_w == 0) { h_off /= dilation_h; w_off /= dilation_w; - int data_col_index = + auto data_col_index = (((c * filter_height + h_off) * filter_width + w_off) * col_height + h_col) * @@ -358,15 +359,15 @@ __global__ void im2colOCF(const T* im_data, channelid += blockDim.z) { for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { - int width_offset = idx + swid * stride_width - padding_width; - int height_offset = idy + shid * stride_height - padding_height; - int im_offset = width_offset + height_offset * im_width + - channelid * im_height * im_width; + auto width_offset = idx + swid * stride_width - padding_width; + auto height_offset = idy + shid * stride_height - padding_height; + auto im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; - int col_offset = idx + idy * filter_width + - channelid * filter_height * filter_width + - (shid * col_width + swid) * - (im_channels * filter_height * filter_width); + auto col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); col_data[col_offset] = (height_offset >= im_height || height_offset < 0 || @@ -430,7 +431,7 @@ class Im2ColFunctor { block_dim_y = 32; } - int block_dim_z = 1024 / block_dim_x / block_dim_y; + auto block_dim_z = 1024 / block_dim_x / block_dim_y; dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); dim3 grid(col_width, col_height); im2colOCF<<>>(im.data(), @@ -469,15 +470,15 @@ __global__ void col2imOCF(const T* col_data, channelid += blockDim.z) { for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { - int width_offset = idx + swid * stride_width - padding_width; - int height_offset = idy + shid * stride_height - padding_height; - int im_offset = width_offset + height_offset * im_width + - channelid * im_height * im_width; + auto width_offset = idx + swid * stride_width - padding_width; + auto height_offset = idy + shid * stride_height - padding_height; + auto im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; - int col_offset = idx + idy * filter_width + - channelid * filter_height * filter_width + - (shid * col_width + swid) * - (im_channels * filter_height * filter_width); + auto col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); if (height_offset >= 0 && height_offset < im_height && width_offset >= 0 && width_offset < im_width) { @@ -557,7 +558,7 @@ class Col2ImFunctor { block_dim_y = 32; } - int block_dim_z = 1024 / block_dim_x / block_dim_y; + auto block_dim_z = 1024 / block_dim_x / block_dim_y; dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); dim3 grid(col_width, col_height); col2imOCF<<>>(col.data(), diff --git a/paddle/phi/kernels/funcs/im2col_cfo_cpu.h 
b/paddle/phi/kernels/funcs/im2col_cfo_cpu.h index 1e639f1787cfec..545c01b1947041 100644 --- a/paddle/phi/kernels/funcs/im2col_cfo_cpu.h +++ b/paddle/phi/kernels/funcs/im2col_cfo_cpu.h @@ -42,7 +42,7 @@ inline void im2col_common(const phi::DenseTensor& im, int filter_width = col->dims()[2]; int output_height = col->dims()[3]; int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; + auto channels_col = im_channels * filter_height * filter_width; // Convert dimensions to 64-bit to prevent overflow in arithmetic operations const int64_t im_channels64 = im_channels; @@ -58,9 +58,9 @@ inline void im2col_common(const phi::DenseTensor& im, int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + auto im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + auto im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; // Calculate col_idx using 64-bit arithmetic to prevent overflow int64_t col_idx64 = @@ -223,7 +223,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } if (data_layout != DataLayout::kNHWC) { // Safe memcpy for filter_width == 1 case - int want = output_width - plw - prw; + auto want = output_width - plw - prw; int avail = im_width; int n = std::max(0, std::min(want, avail)); if (n > 0) { @@ -236,7 +236,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width - plw - prw; ++kow) { - int im_row = oh - plh + kh; + auto im_row = oh - plh + kh; int im_col = kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { @@ -311,7 +311,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width - (plw - kw); ++kow) { - int im_row = oh - plh + kh; + auto im_row = oh - plh + kh; int im_col = kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { @@ -339,8 +339,8 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width; ++kow) { - int im_row = oh - plh + kh; - int im_col = kw - plw + kow; + auto im_row = oh - plh + kh; + auto im_col = kw - plw + kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { dst_data[kow] = @@ -368,8 +368,8 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } else { for (int kow = 0; kow < output_width - i; ++kow) { - int im_row = oh - plh + kh; - int im_col = kw - plw + kow; + auto im_row = oh - plh + kh; + auto im_col = kw - plw + kow; if (im_row >= 0 && im_row < im_height && im_col >= 0 && im_col < im_width) { dst_data[kow] = diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index 0e14e613468109..d952bf17549a62 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -161,7 +161,7 @@ static phi::DDim BroadCastTensorsDims( int target_dim_size = 1; for (const auto& tensor : tensors) { auto input_ddim = tensor->dims(); - int axis = static_cast(input_ddim.size()) - index - 1; + auto axis = static_cast(input_ddim.size()) - index - 1; int dim_size = 1; if (axis >= 0) { dim_size = input_ddim[axis]; diff --git a/paddle/phi/kernels/funcs/jit/gen/seqpool.cc 
b/paddle/phi/kernels/funcs/jit/gen/seqpool.cc index 484bff22be4ea5..4ed94f6c1b5fcf 100644 --- a/paddle/phi/kernels/funcs/jit/gen/seqpool.cc +++ b/paddle/phi/kernels/funcs/jit/gen/seqpool.cc @@ -40,7 +40,8 @@ void SeqPoolJitCode::genCode() { vdivps(xmm_t(1), xmm_t(1), xmm_t(0)); vmovss(ptr[reg_tmp], xmm_t(1)); } - const int group_len = max_num_regs * block * sizeof(float); + const auto group_len(max_num_regs * block * sizeof(float)); + for (int g = 0; g < num_groups; ++g) { pool_height(g * group_len, block, max_num_regs); } diff --git a/paddle/phi/kernels/funcs/jit/gen_base.cc b/paddle/phi/kernels/funcs/jit/gen_base.cc index 71701b96f3b640..81ffa663c201cf 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.cc +++ b/paddle/phi/kernels/funcs/jit/gen_base.cc @@ -81,7 +81,8 @@ std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { } // one for x, one for y, others for z const int max_used_regs_for_n = max_num_regs - 2; - const int aligned_n = n % block == 0 ? n : (n / block + 1) * block; + const auto aligned_n(n % block == 0 ? n : (n / block + 1) * block); + const int num_block = aligned_n / block; const int num_groups = num_block / max_used_regs_for_n; std::vector groups(num_groups, max_used_regs_for_n); diff --git a/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc b/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc index 43a011277cb5ff..3c742b378f4f3d 100644 --- a/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc @@ -83,7 +83,7 @@ void CRFDecoding(const int seq_len, __m256i max_j = _mm256_set1_epi32(0); #endif /* Calculate the offset of transition_weights.*/ - int trans_offset = state_trans_base_idx * tag_num + j_offset; + auto trans_offset = state_trans_base_idx * tag_num + j_offset; for (int i = 0; i < tag_num; ++i) { /* Initialize the content of alpha variable with related offset.*/ #ifdef __AVX512F__ diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 4eae698648996b..1a7e48e6e0301d 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -223,8 +223,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( const int warp_n = warp % WARPS_N; // 0 const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 - const int c = warp_n * THREADS_PER_WARP + lane; // lane - const int r = bidx * ROWS_PER_CTA + warp_m; // row id + const auto c(warp_n * THREADS_PER_WARP + lane); + // lane + const auto r(bidx * ROWS_PER_CTA + warp_m); + // row id Vec_scale gamma[LDGS]; Vec_scale beta[LDGS]; @@ -874,9 +876,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_fast_final_kernel( const int warp = tidx / THREADS_PER_WARP; const int warp_m = warp / WARPS_N; const int warp_n = warp % WARPS_N; - const int tid_c = warp_n * THREADS_PER_WARP + lane; + const auto tid_c(warp_n * THREADS_PER_WARP + lane); + + const auto c(bidx * THREADS_PER_ROW + tid_c); - const int c = bidx * THREADS_PER_ROW + tid_c; const int r = warp_m; __shared__ U smem_space[(WARPS_M - 1) * THREADS_PER_ROW * VecSize]; @@ -1017,7 +1020,7 @@ void ln_bwd_fast_kernel_driver(const phi::GPUContext &dev_ctx, const int ROWS_PER_CTA = WARPS_M; // 4 * 1024 * 4 - const int SMEM_BYTES = ROWS_PER_CTA * cols * sizeof(U); + const auto SMEM_BYTES(ROWS_PER_CTA * cols * sizeof(U)); // #blocks = 2 * #SM const int gridx = 2 * dev_ctx.GetSMCount(); @@ -1591,7 +1594,8 @@ __global__ void 
LayerNormBackwardComputeGradInputWithSmallFeatureSize( VecT temp_grad; #pragma unroll for (int k = 0; k < DataPerTid; ++k) { - const int idx = i * DataPerTid + k; + const auto idx(i * DataPerTid + k); + const U c_h = input_data[idx]; const U c_loss = dout_data[idx]; U f_grad_input = fH * c_loss * gamma_data[idx] - sum_loss1; @@ -1606,7 +1610,8 @@ __global__ void LayerNormBackwardComputeGradInputWithSmallFeatureSize( VecT temp_grad; #pragma unroll for (int k = 0; k < DataPerTid; ++k) { - const int idx = i * DataPerTid + k; + const auto idx(i * DataPerTid + k); + const U c_h = input_data[idx]; const U c_loss = dout_data[idx]; U f_grad_input = fH * c_loss - sum_loss1; diff --git a/paddle/phi/kernels/funcs/math/beam_search.cu b/paddle/phi/kernels/funcs/math/beam_search.cu index 66c0b1951585b1..d51e3424b4419a 100644 --- a/paddle/phi/kernels/funcs/math/beam_search.cu +++ b/paddle/phi/kernels/funcs/math/beam_search.cu @@ -98,7 +98,7 @@ __device__ __forceinline__ int SelectTopBeam(Triple* top_beam, Insert(top_beam_local, tmp, beam_size); } } else { - int index = offset * seq_width + tid_of_seq; + auto index = offset * seq_width + tid_of_seq; if (!IsAccumulated) { float pre_score = pre_scores[offset]; for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { @@ -263,7 +263,8 @@ __device__ void BeamSearchDetails(int64_t* selected_ids, int selected_seq_length = finish_flag ? 0 : num_items; if (MaxSeqs > 1) { - const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + const auto seq_id((MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid); + __shared__ int shared_mem[MaxSeqs]; // [0, MaxSeqs - 1], length of each sequences @@ -322,7 +323,7 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, bool is_accumulated, int num_used_threads) { const int tid = threadIdx.x; - const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + const auto seq_id((MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid); int seq_offset_start = static_cast(seq_offsets[seq_id]); int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3206a..90545c3ccbe3df 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -162,7 +162,7 @@ class ContextProjectFunctor { up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); for (int k = 0; k < padding_rows; ++k) { - int padding_size = + auto padding_size = k + context_length < up_pad ? context_length : up_pad - k; phi::DenseTensor out_t_sub = out_t.Slice( k * context_length, k * context_length + padding_size); @@ -176,7 +176,7 @@ class ContextProjectFunctor { (sequence_height - context_start - context_length) + 1) + 1; int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = + auto padding_size = sequence_height - context_start >= context_length ? 1 : context_length - (sequence_height - context_start); @@ -292,7 +292,7 @@ class ContextProjectGradFunctor { up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); for (int k = 0; k < padding_rows; ++k) { - int padding_size = + auto padding_size = k + context_length < up_pad ? 
context_length : up_pad - k; phi::DenseTensor out_t_sub = out_t.Slice( k * context_length, k * context_length + padding_size); @@ -309,7 +309,7 @@ class ContextProjectGradFunctor { 0, (sequence_height - context_start - context_length) + 1) + 1; int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = + auto padding_size = sequence_height - context_start >= context_length ? 1 : context_length - (sequence_height - context_start); diff --git a/paddle/phi/kernels/funcs/math/tree2col.cu b/paddle/phi/kernels/funcs/math/tree2col.cu index a388072679e500..849d6ec4b011b2 100644 --- a/paddle/phi/kernels/funcs/math/tree2col.cu +++ b/paddle/phi/kernels/funcs/math/tree2col.cu @@ -34,7 +34,8 @@ __global__ void tree2col(const T* eta, const int patch_id = thread_id / feature_size; const int j = thread_id % feature_size; if (patch_id < n) { - const int begin_o = patch_id * 3 * feature_size; + const auto begin_o(patch_id * 3 * feature_size); + const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1]; T res_l = 0, res_r = 0, res_t = 0; for (int i = begin; i < end; i++) { diff --git a/paddle/phi/kernels/funcs/math/unpooling.cc b/paddle/phi/kernels/funcs/math/unpooling.cc index fffbf8ef7130bc..426622e3bba8a8 100644 --- a/paddle/phi/kernels/funcs/math/unpooling.cc +++ b/paddle/phi/kernels/funcs/math/unpooling.cc @@ -120,8 +120,8 @@ class Unpool3dMaxFunctor { const int output_depth = static_cast(output->dims()[2]); const int output_height = static_cast(output->dims()[3]); const int output_width = static_cast(output->dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = context.template Alloc(output); @@ -168,8 +168,8 @@ class Unpool3dMaxGradFunctor { const int output_depth = static_cast(output.dims()[2]); const int output_height = static_cast(output.dims()[3]); const int output_width = static_cast(output.dims()[4]); - int input_feasize = input_depth * input_height * input_width; - int output_feasize = output_depth * output_height * output_width; + auto input_feasize = input_depth * input_height * input_width; + auto output_feasize = output_depth * output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = context.template Alloc(input_grad); diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e10122497096fb..2b4fb15b3f48a9 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -67,7 +67,7 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // Copy the addresses of A and A_inv from host to device, // and allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + auto num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int); phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/funcs/matrix_solve.h b/paddle/phi/kernels/funcs/matrix_solve.h index 27abdf8c2c96a7..5ab94be0ce5142 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.h +++ b/paddle/phi/kernels/funcs/matrix_solve.h @@ -99,14 +99,14 @@ void compute_solve_eigen(const Context& dev_ctx, const auto& a_mat_dims = a.dims(); const int a_rank = a_mat_dims.size(); int n = a_mat_dims[a_rank - 1]; - int a_batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; + auto a_batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; // prepare for b const auto& b_mat_dims = b.dims(); const int b_rank = b_mat_dims.size(); int b_h = n; int b_w = b_mat_dims[b_rank - 1]; - int b_batch_size = b_rank > 2 ? b.numel() / (b_h * b_w) : 1; + auto b_batch_size = b_rank > 2 ? b.numel() / (b_h * b_w) : 1; const T* a_ptr = a.data(); const T* b_ptr = b.data(); diff --git a/paddle/phi/kernels/funcs/maxouting.cc b/paddle/phi/kernels/funcs/maxouting.cc index fca6d8e39553a4..ef9e09dec9dfbf 100644 --- a/paddle/phi/kernels/funcs/maxouting.cc +++ b/paddle/phi/kernels/funcs/maxouting.cc @@ -84,7 +84,7 @@ void MaxOutGradFunctor::operator()( const T* output_grad_data = output_grad.data(); T* input_grad_data = dev_ctx.template Alloc(input_grad); for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; + auto blen = fea_size * output_channels * i; for (int c = 0; c < output_channels; ++c) { int clen = fea_size * c; for (int f = 0; f < fea_size; ++f) { diff --git a/paddle/phi/kernels/funcs/multi_tensor_apply_util.h b/paddle/phi/kernels/funcs/multi_tensor_apply_util.h index e146005c49a697..8523559779f673 100644 --- a/paddle/phi/kernels/funcs/multi_tensor_apply_util.h +++ b/paddle/phi/kernels/funcs/multi_tensor_apply_util.h @@ -89,11 +89,13 @@ static __global__ void MultiTensorApplyCUDAKernel( Args... args) { const int block_id = blockIdx.x; const int tensor_id = meta.tensor_ids[block_id]; - const int chunk_id = static_cast(meta.chunk_ids[block_id]) + - (tensor_id == 0) * meta.start_chunk_id; + const auto chunk_id(static_cast(meta.chunk_ids[block_id]) + + (tensor_id == 0) * meta.start_chunk_id); + const int prev_offset = meta.offsets[tensor_id]; const int next_offset = meta.offsets[tensor_id + 1]; - const int ptr_offset = prev_offset + chunk_id * chunk_size; + const auto ptr_offset(prev_offset + chunk_id * chunk_size); + const int size = min(next_offset - ptr_offset, chunk_size); functor( diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index b41106a6368d7b..d7f54f6b0c0a4d 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -546,7 +546,7 @@ inline void MatmulWithHeadQK(const phi::GPUContext &dev_ctx, seq_len * size_per_head); if (seq_len <= 1024) { - int grid = batch_size * head_num * seq_len; + auto grid = batch_size * head_num * seq_len; int block = seq_len; // Align block to 32, also limit seq_len to max block size. 
@@ -594,7 +594,7 @@ inline void MatmulWithHeadQK(const phi::GPUContext &dev_ctx, qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); } } else { - int grid = batch_size * head_num * seq_len; + auto grid = batch_size * head_num * seq_len; int block = 512; if (seq_len % 2 == 0) { if (std::is_same::value) { @@ -694,7 +694,7 @@ void MultiheadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, T alpha, T beta) { auto stream = dev_ctx.stream(); - const int tsize = batch * head_num * seq_len * head_size; + const auto tsize(batch * head_num * seq_len * head_size); T *qptr = tptr; T *kptr = qptr + tsize; diff --git a/paddle/phi/kernels/funcs/norm_utils.cu.h b/paddle/phi/kernels/funcs/norm_utils.cu.h index 4b1ed6ddb9c9e6..238a306858a12a 100644 --- a/paddle/phi/kernels/funcs/norm_utils.cu.h +++ b/paddle/phi/kernels/funcs/norm_utils.cu.h @@ -459,7 +459,8 @@ void NormDoubleGradFunctor(const DeviceContext &dev_ctx, : x_dims[x_dims.size() - 1]); const int N = x_dims[0]; const int64_t num = X->numel(); - const int sample_size = num / N / C; + const auto sample_size(num / N / C); + phi::DenseTensor scale_tmp; if (!Scale) { scale_tmp.Resize({C}); diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 06bcee3be384c1..51219bb48392fe 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1518,8 +1518,8 @@ void Pool3dDirectCUDAFunctor::operator()( const int padding_height = paddings[1]; const int padding_width = paddings[2]; - int nthreads = batch_size * output_channels * output_depth * output_height * - output_width; + auto nthreads = batch_size * output_channels * output_depth * output_height * + output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON thread_num = 512; diff --git a/paddle/phi/kernels/funcs/sequence_padding.cc b/paddle/phi/kernels/funcs/sequence_padding.cc index 3eb20dec6afcd2..b0110d618f4489 100644 --- a/paddle/phi/kernels/funcs/sequence_padding.cc +++ b/paddle/phi/kernels/funcs/sequence_padding.cc @@ -37,7 +37,7 @@ void CopyValidData(phi::DenseTensor* dst_tensor, T* dst_data = dst_tensor->data(); int seq_cpy_gap = step_width; - int pad_cpy_gap = + auto pad_cpy_gap = layout == kBatchLengthWidth ? step_width : seq_num * step_width; for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { int valid_seq_len = @@ -54,9 +54,9 @@ void CopyValidData(phi::DenseTensor* dst_tensor, pad_seq_len, valid_seq_len)); int seq_data_offset = static_cast(seq_offsets[seq_idx] * step_width); - int pad_data_offset = layout == kBatchLengthWidth - ? seq_idx * pad_seq_len * step_width - : seq_idx * step_width; + auto pad_data_offset = layout == kBatchLengthWidth + ? seq_idx * pad_seq_len * step_width + : seq_idx * step_width; float scale = 1.0f / static_cast(valid_seq_len); for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) { diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index f27174d5818186..a218bdb896b14f 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -48,7 +48,7 @@ __global__ void ScatterKernel(const T* input, for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; + auto channels_i = i - indices_i * vec_channels; int start = unique_value[indices_i]; int end = indices_i == non_zero_num - 1 ? 
rulebook_len @@ -89,14 +89,15 @@ __global__ void ScatterKernelV2(const T* input, for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; + auto channels_i = i - indices_i * vec_channels; StoreT sums = {static_cast(0)}; phi::Load(out + indices_i * channels + channels_i * VecSize, &sums); for (int it = 0; it < buffer_counts; it++) { int len = index_counts[indices_i + it * non_zero_num]; - const int group_offset = it * kernel_size * non_zero_num; + const auto group_offset(it * kernel_size * non_zero_num); + for (int j = 0; j < len; j++) { const int out_feature_i = index_groups[indices_i * kernel_size + j + group_offset]; diff --git a/paddle/phi/kernels/funcs/stack_functor.h b/paddle/phi/kernels/funcs/stack_functor.h index a84967ad7111b3..d73137b15049ac 100644 --- a/paddle/phi/kernels/funcs/stack_functor.h +++ b/paddle/phi/kernels/funcs/stack_functor.h @@ -26,8 +26,8 @@ struct StackFunctor { HOSTDEVICE void operator()(int idx) { int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; + auto which_x = idx / post_ - i * n_; + auto x_index = i * post_ + idx % post_; y_[idx] = x_[which_x][x_index]; } @@ -45,8 +45,8 @@ struct StackGradFunctor { HOSTDEVICE void operator()(int idx) { int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; + auto which_x = idx / post_ - i * n_; + auto x_index = i * post_ + idx % post_; if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; } diff --git a/paddle/phi/kernels/funcs/sync_batch_norm_utils.h b/paddle/phi/kernels/funcs/sync_batch_norm_utils.h index 0715cec7fc8215..bd422351768691 100644 --- a/paddle/phi/kernels/funcs/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/funcs/sync_batch_norm_utils.h @@ -143,8 +143,8 @@ __global__ void KeBackwardLocalStats(const T *dy, BatchNormParamType sum2 = 0.; auto mean = means[k]; for (int i = threadIdx.x; i < N * M; i += blockDim.x) { - int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M - : i * C + k; + auto id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; auto g = static_cast>(dy[id]); sum1 += g; auto x_i = static_cast>(x[id]); @@ -187,8 +187,8 @@ __global__ void KeBackwardLocalStats2D(const T *dy, auto mean = means[k]; for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < N * M; i += gridDim.y * blockDim.y) { - int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M - : i * C + k; + auto id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; auto g = static_cast>(dy[id]); sum1 += g; auto x_i = static_cast>(x[id]); @@ -247,9 +247,10 @@ static __global__ void KeBNBackwardScaleBias( auto inv_var_i = inv_variance[i]; auto mean_i = mean[i]; for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int id = layout == DataLayout::kNCHW - ? ((j / HxW) * C + i) * HxW + (j % HxW) - : j * outer_size + i; + const auto id(layout == DataLayout::kNCHW + ? 
((j / HxW) * C + i) * HxW + (j % HxW) : j * outer_size + i); + auto x_i = static_cast>(x[id]); auto dy_i = static_cast>(dy[id]); ds_sum += dy_i * (x_i - mean_i); diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3273c..54702098ae0e52 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -793,7 +793,7 @@ __device__ void RadixSearch(const T* input, RadixType desired_mask = 0; #pragma unroll - for (int digit_pos = sizeof(T) * 8 - RADIX_BITS; digit_pos >= 0; + for (auto digit_pos = static_cast<int>(sizeof(T) * 8) - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) { RadixCountUsingMask( input, diff --git a/paddle/phi/kernels/funcs/transpose_function.cu.h b/paddle/phi/kernels/funcs/transpose_function.cu.h index 59daa0b8d73c89..2ba8f750aa5bae 100644 --- a/paddle/phi/kernels/funcs/transpose_function.cu.h +++ b/paddle/phi/kernels/funcs/transpose_function.cu.h @@ -153,7 +153,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, if (x < in_effective_thread_num) { // Read a tile from input using block. int x_i = x / TileY; - int x_j = x - x_i * TileY; + auto x_j = x - x_i * TileY; IndexType input_ind = input_origin_block_flat_index + x_i * input_dims[2] + x_j; IndexType input_inc = BlockReadRows * input_dims[2]; @@ -197,7 +197,7 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, if (x < out_effective_thread_num) { int x_i = x / TileX; - int x_j = x - x_i * TileX; + auto x_j = x - x_i * TileX; IndexType output_ind = output_origin_block_flat_index + x_i * output_dims[2] + x_j; IndexType output_inc = BlockWriteRows * output_dims[2]; @@ -473,7 +473,7 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, // can split input properly, in another words: num_wasted_threads=0. int num_full_tiles = input_long_edge / proposed_tile_long_edge; - int num_wasted_threads = + auto num_wasted_threads = input_long_edge - num_full_tiles * proposed_tile_long_edge; float cost = num_wasted_threads; @@ -951,8 +951,8 @@ struct PermTypeClassifier { type_ = PermuteType::kGeneralTranspose; num_rows_tile_ = GET_TILE_SIZE(dims[rank - 2], kTileSize); int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src); - int tile_size = channel * num_rows_tile_ * - GET_TILE_SIZE(dims[last_idx], kTileSize); + auto tile_size = channel * num_rows_tile_ * + GET_TILE_SIZE(dims[last_idx], kTileSize); vec_size_ = tile_size < sm_count ? 1 : dim_vec_size; } else { type_ = PermuteType::kGeneralPermute; @@ -970,7 +970,7 @@ struct PermTypeClassifier { num_rows_tile_ = GET_TILE_SIZE(dims[0], kTileSize); int dim_vec_size = GetDimVecSize(dst_vec_size, dims[last_idx], src); - int tile_size = + auto tile_size = dims[1] * num_rows_tile_ * GET_TILE_SIZE(dims[2], kTileSize); vec_size_ = tile_size < sm_count ?
1 : dim_vec_size; } else { @@ -1232,7 +1232,7 @@ struct TransposeDataWriter { OutVecT tmp_data[ReadSize]; #pragma unroll for (int i = 0; i < ReadSize; ++i) { - int tile_tail = tile_y * ReadSize + i; + auto tile_tail = tile_y * ReadSize + i; int major_share_idx = share_tile + tile_tail; IndexT row_in_mat = (blockIdx.x * kColTile + tile_tail) * col_stride; #pragma unroll @@ -1266,7 +1266,8 @@ struct TransposeDataWriter { #pragma unroll for (int tile_y = threadIdx.y; tile_y < cols_range; tile_y += kBlockRows) { - const int shared_major = shared_tile + tile_y * ReadSize; + const auto shared_major(shared_tile + tile_y * ReadSize); + const IndexT row_major = (row_tile + tile_y * ReadSize) * col_stride; #pragma unroll for (int i = 0; i < ReadSize; ++i) { diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h index 709f22bec4ec3a..5272f532b53e25 100644 --- a/paddle/phi/kernels/funcs/unsqueeze.h +++ b/paddle/phi/kernels/funcs/unsqueeze.h @@ -54,8 +54,8 @@ inline DDim GetOutputSqueezeShape(const std::vector squeeze_dims, continue; } - int current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size() - : squeeze_dims[i]; + auto current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size() + : squeeze_dims[i]; PADDLE_ENFORCE_GE( current, @@ -118,7 +118,7 @@ inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, UNSQUEEZE_MAX_RANK_SUPPORTED)); for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_rank + 1 : axis; + auto cur = axis < 0 ? axis + cur_output_rank + 1 : axis; // Validity Check: the axis bound PADDLE_ENFORCE_GE( cur, diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc index b3ffc6d822ef9f..3c3c8891a5e306 100644 --- a/paddle/phi/kernels/funcs/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -61,7 +61,7 @@ class Vol2ColFunctor { int64_t output_depth = col->dims()[4]; int64_t output_height = col->dims()[5]; int64_t output_width = col->dims()[6]; - int channels_col = + auto channels_col = input_channels * filter_depth * filter_height * filter_width; // changed @@ -187,7 +187,7 @@ class Col2VolFunctor { int output_depth = static_cast(col.dims()[4]); int output_height = static_cast(col.dims()[5]); int output_width = static_cast(col.dims()[6]); - int channels_col = + auto channels_col = input_channels * filter_depth * filter_height * filter_width; bool paddings_size_is_6 = (paddings.size() == 6); @@ -238,13 +238,13 @@ class Col2VolFunctor { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; int d_offset = (c / filter_width / filter_height) % filter_depth; - int cIm = c / filter_width / filter_height / filter_depth; + auto cIm = c / filter_width / filter_height / filter_depth; for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + auto d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + auto h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + auto w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { @@ -259,7 +259,7 @@ class Col2VolFunctor { input_channels + cIm; } - int col_idx = + auto col_idx = ((c * output_depth + d) * output_height + h) * 
output_width + w; vol_data[vol_idx] += col_data[col_idx]; diff --git a/paddle/phi/kernels/funcs/vol2col.cu b/paddle/phi/kernels/funcs/vol2col.cu index da81d027effc8e..176d3e1d167d8c 100644 --- a/paddle/phi/kernels/funcs/vol2col.cu +++ b/paddle/phi/kernels/funcs/vol2col.cu @@ -56,11 +56,11 @@ __global__ void vol2col(int64_t num_kernels, int w_out = index % output_width; int h_out = (index / output_width) % output_height; int d_out = (index / output_width / output_height) % output_detph; - int channel_in = index / output_width / output_height / output_detph; - int channel_out = channel_in * filter_depth * filter_height * filter_width; - int w_in = w_out * stride_width - padding_width; - int h_in = h_out * stride_height - padding_height; - int d_in = d_out * stride_depth - padding_depth; + auto channel_in = index / output_width / output_height / output_detph; + auto channel_out = channel_in * filter_depth * filter_height * filter_width; + auto w_in = w_out * stride_width - padding_width; + auto h_in = h_out * stride_height - padding_height; + auto d_in = d_out * stride_depth - padding_depth; data_col += ((static_cast(channel_out) * output_detph + d_out) * output_height + @@ -70,9 +70,9 @@ __global__ void vol2col(int64_t num_kernels, for (int k = 0; k < filter_depth; ++k) { for (int i = 0; i < filter_height; ++i) { for (int j = 0; j < filter_width; ++j) { - int d = d_in + k * dilation_d; - int h = h_in + i * dilation_h; - int w = w_in + j * dilation_w; + auto d = d_in + k * dilation_d; + auto h = h_in + i * dilation_h; + auto w = w_in + j * dilation_w; int64_t vol_idx; if (data_layout != DataLayout::kNHWC) { vol_idx = @@ -242,11 +242,13 @@ __global__ void col2vol(int64_t num_kernels, int output_width, T* data_vol, const DataLayout data_layout) { - const int d_filter_depth = dilation_d * (filter_depth - 1) + 1; - const int d_filter_height = dilation_h * (filter_height - 1) + 1; - const int d_filter_width = dilation_w * (filter_width - 1) + 1; + const auto d_filter_depth(dilation_d * (filter_depth - 1) + 1); - int input_channels = num_kernels / depth / height / width; + const auto d_filter_height(dilation_h * (filter_height - 1) + 1); + + const auto d_filter_width(dilation_w * (filter_width - 1) + 1); + + auto input_channels = num_kernels / depth / height / width; for (int64_t index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; index += blockDim.x * gridDim.x) { @@ -264,22 +266,22 @@ __global__ void col2vol(int64_t num_kernels, : index % input_channels); // compute the start and end of the output - int w_col_start = + auto w_col_start = (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; int w_col_end = min(w / stride_width + 1, output_width); - int h_col_start = + auto h_col_start = (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; int h_col_end = min(h / stride_height + 1, output_height); - int d_col_start = + auto d_col_start = (d < d_filter_depth) ? 
0 : (d - d_filter_depth) / stride_depth + 1; int d_col_end = min(d / stride_depth + 1, output_detph); for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - int d_off = (d - d_col * stride_depth); - int h_off = (h - h_col * stride_height); - int w_off = (w - w_col * stride_width); + auto d_off = (d - d_col * stride_depth); + auto h_off = (h - h_col * stride_height); + auto w_off = (w - w_col * stride_width); if (d_off % dilation_d == 0 && h_off % dilation_h == 0 && w_off % dilation_w == 0) { d_off /= dilation_d; diff --git a/paddle/phi/kernels/funcs/weight_dequant_functor.h b/paddle/phi/kernels/funcs/weight_dequant_functor.h index 7377cab0ac2db5..5d25b07316758c 100644 --- a/paddle/phi/kernels/funcs/weight_dequant_functor.h +++ b/paddle/phi/kernels/funcs/weight_dequant_functor.h @@ -140,7 +140,7 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. - int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); + auto row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); weight += tile_id * k * 2; output += row_id * k; float scale = static_cast(scale_list[row_id]); @@ -196,7 +196,7 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. - int row_id = tile_id * 4 + ((lane_id % 8) / 2); + auto row_id = tile_id * 4 + ((lane_id % 8) / 2); weight += tile_id * k / 2 * 4; output += row_id * k; float scale = static_cast(scale_list[row_id]); @@ -254,14 +254,14 @@ __global__ void int8_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. - int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); + auto row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); weight += tile_id * k * 2; output += row_id * k; scales += row_id; #pragma unroll for (int i = lane_id * 16; i < k * 2; i += 16 * 32) { - int scale_offset = i / 2 / group_size; + auto scale_offset = i / 2 / group_size; float scale = static_cast(scales[scale_offset * n]); Load(&weight[i], &vec_weight); #pragma unroll @@ -314,14 +314,14 @@ __global__ void int4_weight_only_dequant(const uint8_t* weight, // elements of the first four and last four threads of each 8 consecutive // threads will come from row 2N and row 2N+1 respectively before // interleaving. 
- int row_id = tile_id * 4 + ((lane_id % 8) / 2); + auto row_id = tile_id * 4 + ((lane_id % 8) / 2); weight += tile_id * k / 2 * 4; output += row_id * k; scales += row_id; #pragma unroll for (int i = lane_id * 32; i < k * 4; i += 32 * 32) { Load(&weight[i / 2], &vec_weight); - int scale_offset = i / 4 / group_size; + auto scale_offset = i / 4 / group_size; float scale = static_cast(scales[scale_offset * n]); #pragma unroll for (int p = 0; p < 32; p += Converter::kHalfLength) { diff --git a/paddle/phi/kernels/funcs/weight_only_gemv.cu b/paddle/phi/kernels/funcs/weight_only_gemv.cu index 5cd1560694138a..1f87a0a8b2dd5b 100644 --- a/paddle/phi/kernels/funcs/weight_only_gemv.cu +++ b/paddle/phi/kernels/funcs/weight_only_gemv.cu @@ -316,7 +316,8 @@ __global__ void int8_weight_only_gemv(const T* input, const int warp_id = threadIdx.x / kWarpSize; const int lane_id = threadIdx.x % kWarpSize; const int tile_id = blockIdx.x * blockDim.x / kWarpSize + warp_id; - const int row_id = tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0); + const auto row_id(tile_id * 2 + ((lane_id % 8) > 3 ? 1 : 0)); + weight += tile_id * k * 2; float v = 0.f, scale = static_cast(scale_list[row_id]), v_bias; @@ -922,7 +923,8 @@ __global__ void weight_only_batched_gemv_multi_warp(const T* in, constexpr int Num = Batch * NPerBlock; const int tid = threadIdx.x; const int bid = blockIdx.x; - const int n_start_id = bid * NPerBlock * Interleave; + const auto n_start_id(bid * NPerBlock * Interleave); + using HALF_2_TYPE = typename CUDA_HALF_2_TYPE_TARIS::type; // Calculate the n-dimensional index of the data processed by the current // thread in the interleave tile @@ -1039,7 +1041,7 @@ __global__ void weight_only_batched_gemv_multi_warp(const T* in, #endif bias_v = ConvertFloatFunc::apply(bias[n_start_id + nid]); } - int b = i / NPerBlock / Interleave; + auto b = i / NPerBlock / Interleave; out[b * n + n_start_id + nid] = ConvertDstFunc::apply( GeluActivation::apply(v + bias_v)); } @@ -1066,7 +1068,7 @@ void select_activation_and_bias(const T* input, static constexpr int kInterleave = WeightLayoutDetails::kInterleave; dim3 grid(n / NPerBlock / kInterleave); dim3 block(BlockSize); - int size = sizeof(float) * BlockSize / 32 * Batch * NPerBlock * kInterleave; + auto size = sizeof(float) * BlockSize / 32 * Batch * NPerBlock * kInterleave; if (bias) { if (act_method == "gelu") { weight_only_batched_gemv_multi_warp(batch_starts.size() - 1); - const int offset = tstart * max_bs * D; + const auto offset(tstart * max_bs * D); + batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc index 4ac149b2deae27..72c548bbab8f7e 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc @@ -108,7 +108,7 @@ void SeqCompute(const Context& dev_ctx, hidden_out_data = hidden_out_data + gate_offset; }; for (int i = 0; i < N; ++i) { - int bid = is_reverse ? N - 1 - i : i; + auto bid = is_reverse ? 
N - 1 - i : i; int seq_len = static_cast(x_lod[0][bid + 1] - x_lod[0][bid]); const T* prev_hidden_data = nullptr; int tstart = 0; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc index c00b55f849d5e3..dadcedf2c27e19 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc @@ -135,7 +135,7 @@ void SeqCompute(const Context &dev_ctx, } for (int i = 0; i < N; ++i) { - int bid = is_reverse ? N - 1 - i : i; + auto bid = is_reverse ? N - 1 - i : i; int seq_len = static_cast(x_lod[0][bid + 1] - x_lod[0][bid]); const T *prev_c_data = nullptr; const T *prev_h_data = nullptr; @@ -309,7 +309,8 @@ void BatchCompute(const Context &dev_ctx, // compute kernel part const auto &batch_starts = batched_lod[0]; const int max_seq_len = static_cast(batch_starts.size() - 1); - const int offset = tstart * max_bs * D; + const auto offset(tstart * max_bs * D); + batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc index ecd868b872ad05..449b95f6eafdbe 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc @@ -75,7 +75,7 @@ void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, // zero all up_pad and fill data std::memset(dst_data, 0, up_pad * col_mat_w_sz); dst_data = dst_data + up_pad * src_mat_w; - int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz; + auto copy_size = col_mat_w_sz - up_pad * src_mat_w_sz; for (int j = 0; j < up_pad; ++j) { // blas.VCOPY? 
std::memcpy(dst_data, src_data, copy_size); diff --git a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc index dff41e6d4250cb..7bf2a8ea970e65 100644 --- a/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/self_dp_attention_kernel.cc @@ -80,11 +80,11 @@ void transpose_before_bmm1(const T* qkvBuffer, const T* v_src_each_batch = reinterpret_cast(vBuffer) + blocksize * 3 * i; - int dst_offset_each_bmHead = k * tokenSize * cols_per_bmHead; + auto dst_offset_each_bmHead = k * tokenSize * cols_per_bmHead; int src_offset_each_line = k * cols_per_bmHead; int dst_offset_each_line = j * cols_per_bmHead; - int src_offset_each_bmHead = j * hiddenSize * 3; + auto src_offset_each_bmHead = j * hiddenSize * 3; Tt* q_dst_each_line = q_buffer + i * blocksize + dst_offset_each_bmHead + dst_offset_each_line; @@ -131,7 +131,7 @@ void transpose_after_bmm2(T* Buffer, int dst_offset_each_line = k * hiddenSize; for (int j = 0; j < bmHead; j++) { - int src_offset_each_line = j * tokenSize * cols_per_bmHead; + auto src_offset_each_line = j * tokenSize * cols_per_bmHead; int dst_offset_each_head = j * cols_per_bmHead; Tt* q_dst_each_line = TransBuffer + dst_offset_each_head + @@ -391,13 +391,13 @@ void scaled_dp_attention(const float* query, #else int tid = 0; #endif - int ooffset = + auto ooffset = i * num_head * otsize * head_size + j * otsize * head_size; const float* k = key + ooffset; const float* v = value + ooffset; int q_rblk = std::min(iblk, itsize - m); - int ioffset = + auto ioffset = i * num_head * otsize * head_size + j * otsize * head_size; const float* q = query + ioffset + m * head_size; float* out = output + ioffset + m * head_size; diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu index 6aed60cf1c23b6..c2c8fa31e5d5ee 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu @@ -67,10 +67,10 @@ __global__ void naive_conv2d_kernel(const T *input, const T *residual, float alpha, // for leaky_relu OpType op_type) { - int M = batch * oh * ow; + auto M = batch * oh * ow; int N = oc; int kc = ic / groups; - int K = kc * kh * kw; + auto K = kc * kh * kw; int m_i = threadIdx.x + blockIdx.x * blockDim.x; int n_i = threadIdx.y + blockIdx.y * blockDim.y; if (m_i >= M || n_i >= N) return; @@ -79,23 +79,23 @@ __global__ void naive_conv2d_kernel(const T *input, int oh_i = (m_i % (oh * ow)) / ow; int ow_i = (m_i % (oh * ow)) % ow; int oc_i = n_i; - int groups_i = (oc_i / (oc / groups)); + auto groups_i = (oc_i / (oc / groups)); struct logical_coord weight_shape = {oc, kc, kh, kw}; struct logical_coord input_shape = {batch, ic, ih, iw}; - int out_offset = m_i * N + n_i; + auto out_offset = m_i * N + n_i; float *out_ptr = output + out_offset; float sum = 0.f; for (int k_i = 0; k_i < K; k_i++) { - int ic_i = k_i / (kh * kw) + groups_i * kc; + auto ic_i = k_i / (kh * kw) + groups_i * kc; int kh_i = (k_i % (kh * kw)) / kw; int kw_i = (k_i % (kh * kw)) % kw; struct logical_coord weight_index = {oc_i, k_i / (kh * kw), kh_i, kw_i}; - int ih_i = oh_i * stride_h - pad_h + kh_i * dilation_h; - int iw_i = ow_i * stride_w - pad_w + kw_i * dilation_w; + auto ih_i = oh_i * stride_h - pad_h + kh_i * dilation_h; + auto iw_i = ow_i * stride_w - pad_w + kw_i * dilation_w; if (ih_i < 0 || ih_i >= ih) continue; if (iw_i < 0 || iw_i >= iw) continue; @@ -170,7 +170,7 @@ float 
conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type, T a) { int oh = params.oh; int ow = params.ow; - int M = batch * oh * ow; + auto M = batch * oh * ow; int N = oc; constexpr int blockM = 16; @@ -178,7 +178,7 @@ float conv2d_diff_gpu(const ConvAllParams ¶ms, OpType op_type, T a) { uint3 grid = {(M + blockM - 1) / blockM, (N + blockN - 1) / blockN, 1}; uint3 block = {blockM, blockN, 1}; - int output_size = batch * oc * oh * ow; + auto output_size = batch * oc * oh * ow; T *output_from_cutlass = reinterpret_cast(malloc(sizeof(T) * output_size)); cudaMemcpy(output_from_cutlass, diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h index de85ed672ed43b..527cc07b907266 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h @@ -263,12 +263,12 @@ class SharedLoadIteratorMixed { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int row_ptr_offset = row * ThreadMap::Delta::kRow * stride_ + - group * ThreadMap::Delta::kGroup * stride_ + - cluster * ThreadMap::Delta::kCluster * stride_ + - pointer_offset / LoadType::kElements; + auto row_ptr_offset = row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup * stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); @@ -277,13 +277,13 @@ class SharedLoadIteratorMixed { CUTLASS_PRAGMA_UNROLL for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { - int frag_idx = + auto frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; CUTLASS_PRAGMA_UNROLL for (int v = 0; v < kLoadsPerAccess; ++v) { - int vector_idx = (column * ThreadMap::Delta::kColumn / - kElementsPerAccess * kLoadsPerAccess); + auto vector_idx = (column * ThreadMap::Delta::kColumn / + kElementsPerAccess * kLoadsPerAccess); LoadType const* memory_pointer = pointers_[v] + row_ptr_offset; diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h index 77b3c294c5f1ff..73f9f28acca876 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h @@ -501,8 +501,9 @@ struct GemmFpAIntB { threadblock_tile_offset.m() * Mma::Shape::kM, threadblock_tile_offset.n() * Mma::Shape::kN); - int block_idx = threadblock_tile_offset.m() + - threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + auto block_idx = + threadblock_tile_offset.m() + + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); // Construct the semaphore. 
Semaphore semaphore(params.semaphore + block_idx, thread_idx); diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h index 8ee14f87bdf51e..deb5be2213c260 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h @@ -779,7 +779,7 @@ struct GemmFpAIntBSplitK { int iter_tile_first = reduce_tile_idx * params.block_mapping.iters_per_tile(); - int iter_tile_last = + auto iter_tile_last = iter_tile_first + params.block_mapping.iters_per_tile() - 1; peer_idx_begin = params.block_mapping.get_sk_block_idx(iter_tile_first); diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h index 2f41cf3386be07..8a16d3c4e7dcea 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h @@ -268,7 +268,7 @@ class MmaTensorOpComputeBWithF16 { for (int m = 0; m < MmaIterations::kRow; ++m) { int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m); - int n_offsetB = warp_tileB_k_offset + kExpansionFactor * n; + auto n_offsetB = warp_tileB_k_offset + kExpansionFactor * n; if (AccumulatorsInRowMajor) { // matrix B is reordered mma(ptr_D[n + m_serpentine * MmaIterations::kColumn], ptr_A[m_serpentine], @@ -290,7 +290,7 @@ class MmaTensorOpComputeBWithF16 { for (int n = 0; n < MmaIterations::kColumn; ++n) { int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n); - int n_serpentine_offsetB = + auto n_serpentine_offsetB = warp_tileB_k_offset + kExpansionFactor * n_serpentine; if (AccumulatorsInRowMajor) { // matrix B is reordered mma(ptr_D[n_serpentine + m * MmaIterations::kColumn], diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h index e02e79316c460f..6e21d1c0b98799 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h @@ -469,7 +469,8 @@ class MmaTensorOpDequantizer< const int warp_idx_n, const int lane_idx) { const int warp_offset = warp_idx_n * Shape::kN; - const int base_col = lane_idx & 0xF8 + lane_idx % 4; + const auto base_col(lane_idx & 0xF8 + lane_idx % 4); + const int thread_offset = warp_offset + base_col; pointer_ = smem_scales.data() + thread_offset; } diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index ed6f32407f8f67..62fd113e725873 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -162,11 +162,11 @@ void generic_mixed_gemm_kernelLauncher(const T* A, using Gemm = cutlass::gemm::device::GemmUniversalBase; - const int ldb = + const auto ldb( cutlass::platform::is_same::value ? 
n - : k * GemmKernel::kInterleave; + : k * GemmKernel::kInterleave); typename Gemm::Arguments args( {m, n, k}, @@ -272,11 +272,12 @@ void generic_mixed_gemm_kernelLauncher(const T* A, using Gemm = cutlass::gemm::device::GemmUniversalBase; - const int ldb = + const auto ldb( cutlass::platform::is_same::value ? n - : k * GemmKernel::kInterleave; + : k * GemmKernel::kInterleave); + typename Gemm::Arguments args( cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k}, diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h index 8a491ed727c0ea..dc8a10fbda91af 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/epilogue/epilogue_pipelined.h @@ -605,10 +605,10 @@ class EpiloguePipelined : public EpilogueBase(ref_.data() + @@ -1961,11 +1961,11 @@ struct B2bGemm< for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { CUTLASS_PRAGMA_UNROLL for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { - int r = + auto r = Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) + m; - int c = mma_n * Delta::kColumn + n; - int idx = + auto c = mma_n * Delta::kColumn + n; + auto idx = n + Policy::LaneMmaShape::kN * (mma_n + Iterations::kColumn * (m + mma_m * Policy::LaneMmaShape::kM)); diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h index 9ce029c61733e5..f82d036bde2b97 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h @@ -309,9 +309,9 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; AccessType* memory_pointer = reinterpret_cast(byte_pointer); @@ -360,13 +360,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -431,13 +431,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * 
ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -517,13 +517,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -533,9 +533,9 @@ class PredicatedTileIteratorPrefetch { int output_P = output_PQ / convolution_Q; int output_Q = output_PQ % convolution_Q; - int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + - (2 * output_P + add_P) * 2 * convolution_Q + - 2 * output_Q + add_Q; + auto input_row = output_N * 2 * convolution_P * 2 * convolution_Q + + (2 * output_P + add_P) * 2 * convolution_Q + + 2 * output_Q + add_Q; int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); @@ -592,13 +592,13 @@ class PredicatedTileIteratorPrefetch { for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { CUTLASS_PRAGMA_UNROLL for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = + auto frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - int row_offset = row * ThreadMap::Delta::kRow + - group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; + auto row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; bool row_guard = ((row_offset + thread_start_row_) < extent_row_); @@ -612,9 +612,10 @@ class PredicatedTileIteratorPrefetch { if (output_P > convolution_P - 2) row_add_P = 0; if (output_Q > convolution_Q - 2) row_add_Q = 0; - int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + - ((output_P + row_add_P) / 2) * (convolution_Q / 2) + - (output_Q + row_add_Q) / 2; + auto input_row = + output_N * (convolution_P / 2) * (convolution_Q / 2) + + ((output_P + row_add_P) / 2) * (convolution_Q / 2) + + (output_Q + row_add_Q) / 2; int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h index 0ede20eaec1100..2450dc835f7c16 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_access_iterator_residual_last.h @@ -336,7 +336,7 @@ class PredicatedTileAccessIteratorResidualLast::value / 8) + the_predicates.iteration_vector_; - int strided_index = + auto strided_index = gather_offset_strided + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided; diff --git 
a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h index 6e51ede94d11e9..c9b9a8d18446f5 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/predicated_tile_iterator_residual_last.h @@ -382,8 +382,8 @@ class PredicatedTileIteratorResidualLast(address_iterator_.get()) + byte_offset; @@ -1071,8 +1071,8 @@ class PredicatedTileIteratorResidualLast(address_iterator_.get()) + byte_offset; diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h index fc3a8317ab70a6..9e9bdd5f97290f 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/warp_iterator_from_smem.h @@ -186,7 +186,7 @@ class WarpIteratorFromSmem { CUTLASS_PRAGMA_UNROLL for (int access_m_idx = 0; access_m_idx < kTilesPerInstruction; ++access_m_idx) { - int access_idx = + auto access_idx = access_m_idx + kTilesPerInstruction * (inner_idx + kAccessesInner * inst_m_idx); diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h index df1edc71866203..828927db0d8fd0 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h @@ -1164,12 +1164,12 @@ struct AttentionBackwardKernel { } int32_t key_start = 0; - int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ; + auto key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ; for (; key_start < key_end; key_start += kBlockSizeJ) { output_frags.clear(); int32_t query_start = getQueryStart(p, key_start); - int32_t query_end = query_start + (p.num_queries - query_start) / - kBlockSizeI * kBlockSizeI; + auto query_end = query_start + (p.num_queries - query_start) / + kBlockSizeI * kBlockSizeI; for (; query_start < query_end; query_start += kBlockSizeI) { processBlockIJ(shared_storage, output_frags, @@ -1243,7 +1243,7 @@ struct AttentionBackwardKernel { CUTLASS_PRAGMA_UNROLL for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) { - int key = key_start + j + (thread_id / kThreadsPerKey); + auto key = key_start + j + (thread_id / kThreadsPerKey); if (!skipBoundsChecks && key >= p.num_keys) { continue; } @@ -1781,7 +1781,7 @@ struct AttentionBackwardKernel { bool isFirst = key_start == 0; int col_id = col / MatmulGradQ::ThreadblockShape::kN; - int storage_id = + auto storage_id = (col_id + query_start / kBlockSizeI * ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN)); diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h index 9b27233f5dff1d..a23b01e685138b 100644 --- a/paddle/phi/kernels/fusion/gpu/block_attn.h +++ b/paddle/phi/kernels/fusion/gpu/block_attn.h @@ -120,8 +120,8 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( act_time_step += params.pre_cache_length; - const int *block_table = - params.block_tables + bi * params.max_num_blocks_per_seq; + const auto *block_table(params.block_tables 
+ + bi * params.max_num_blocks_per_seq); typedef PDDataTypeTraits traits_; typedef typename traits_::DataType DataType_; @@ -134,7 +134,7 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( extern __shared__ char smem_[]; - int block_smem_offset = + auto block_smem_offset = div_up(params.max_num_blocks_per_seq, 4) * 4 * sizeof(int); float *qk_smem = reinterpret_cast(smem_ + block_smem_offset); @@ -170,10 +170,13 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( v_dequant_scale = static_cast(params.cache_v_dequant_scales[kv_hi]); } - const int bhi = bi * params.q_num_head + hi; - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; - const int thi = params.cum_offsets ? ti * params.q_num_head + hi : -1; + const auto bhi(bi * params.q_num_head + hi); + + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); + + const auto thi(params.cum_offsets ? ti * params.q_num_head + hi : -1); + int *block_table_smem = reinterpret_cast(smem_); for (int local_id = tid; local_id < params.max_num_blocks_per_seq; @@ -190,12 +193,12 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( const int physical_block_number = block_table_smem[block_idx]; // cache offset of current token - const int base_cache_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh; + const auto base_cache_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh); // qkv [B, S=1, num_head + 2 * (kv_num_head), head_dim] - int qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; + auto qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -218,7 +221,7 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( // Load the current timestep's Q and K, then compute q*k, // with each block computing one head. 
if (tid < QK_VECS_PER_WARP) { - const int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + const auto qk_offset(qkv_base_offset + tid * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -234,8 +237,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( } if (params.add_qkv_bias) { - const int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + tid * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + tid * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + tid * QK_VEC_SIZE); + Qk_vec q_bias; zero(q_bias); Qk_vec k_bias; @@ -266,16 +271,17 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( } else { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = act_time_step * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = act_time_step * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rope_stride; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.q_num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.q_num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); @@ -313,14 +319,16 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( *reinterpret_cast(&q_smem[tid * QK_VEC_SIZE]) = q; if (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) { if (CACHE_TYPE == CacheType::INT8) { - const int offset = base_cache_offset + tid * QK_VEC_SIZE; + const auto offset(base_cache_offset + tid * QK_VEC_SIZE); + QK_Packed_Int8_t k_tmp = round_tmp( mul(k_quant_scale, k)); *reinterpret_cast(¶ms.k_cache_I[offset]) = k_tmp; } else { - const int offset = base_cache_offset + tid * QK_VEC_SIZE; + const auto offset(base_cache_offset + tid * QK_VEC_SIZE); + *reinterpret_cast(¶ms.k_cache[offset]) = k; } } @@ -390,9 +398,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { const int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int k_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki; + const auto k_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki); + #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { if (ti < act_time_step) { @@ -509,9 +518,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void block_attention_kernel( int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int v_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + vi; + const auto v_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + vi); + V_vec v; if (CACHE_TYPE == CacheType::INT8) { mul_pointer_v2( @@ -627,8 +637,8 @@ __global__ 
__launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( return; } - const int *block_table = - params.block_tables + bi * params.max_num_blocks_per_seq; + const auto *block_table(params.block_tables + + bi * params.max_num_blocks_per_seq); typedef PDDataTypeTraits traits_; typedef typename traits_::DataType DataType_; @@ -641,9 +651,9 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( extern __shared__ char smem_[]; - int block_smem_offset = + auto block_smem_offset = div_up(params.max_num_blocks_per_seq, 4) * 4 * sizeof(int); - int q_smem_offset = + auto q_smem_offset = div_up(Dh_MAX * GQA_SUB_PARTITION_SIZE, 4) * 4 * sizeof(T); T *q_smem = reinterpret_cast(smem_ + block_smem_offset); @@ -678,8 +688,8 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( v_dequant_scale = static_cast(params.cache_v_dequant_scales[kv_hi]); } - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); int *block_table_smem = reinterpret_cast(smem_); @@ -698,12 +708,12 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( const int physical_block_number = block_table_smem[block_idx]; // cache offset of current token - const int base_cache_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh; + const auto base_cache_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh); // qkv [B, S=1, num_head + 2 * (kv_num_head), head_dim] - int qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; + auto qkv_base_offset = bi * (params.q_num_head + params.kv_num_head * 2) * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -730,9 +740,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( const int lane_id = tid % WARP_SIZE; if (warp_id < GQA_SUB_PARTITION_SIZE && lane_id < QK_VECS_PER_WARP) { - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id; - const int qk_offset = qkv_base_offset + lane_id * QK_VEC_SIZE; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id); + + const auto qk_offset(qkv_base_offset + lane_id * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -748,8 +759,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( } if (params.add_qkv_bias) { - const int q_bias_offset = hi * Dh + lane_id * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + lane_id * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + lane_id * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + lane_id * QK_VEC_SIZE); + Qk_vec q_bias; zero(q_bias); Qk_vec k_bias; @@ -780,16 +793,17 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( } else { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = act_time_step * Dh + lane_id * QK_VEC_SIZE; + auto rotary_offset = act_time_step * Dh + lane_id * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rope_stride; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = lane_id / stride_all_lastdim * stride_all_lastdim + - (lane_id + stride) % 
(stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.q_num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = lane_id / stride_all_lastdim * stride_all_lastdim + + (lane_id + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.q_num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); @@ -829,14 +843,16 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( if (Dh == Dh_MAX || lane_id * QK_VEC_SIZE < Dh) { if (CACHE_TYPE == CacheType::INT8) { - const int offset = base_cache_offset + lane_id * QK_VEC_SIZE; + const auto offset(base_cache_offset + lane_id * QK_VEC_SIZE); + QK_Packed_Int8_t k_tmp = round_tmp( mul(k_quant_scale, k)); *reinterpret_cast(¶ms.k_cache_I[offset]) = k_tmp; } else { - const int offset = base_cache_offset + lane_id * QK_VEC_SIZE; + const auto offset(base_cache_offset + lane_id * QK_VEC_SIZE); + *reinterpret_cast(¶ms.k_cache[offset]) = k; } } @@ -857,9 +873,11 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( if (lane_id == 0) { qk *= params.inv_sqrt_dh; - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id; - const int bhi = bi * params.q_num_head + hi; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + warp_id); + + const auto bhi(bi * params.q_num_head + hi); + if (params.attn_mask) { auto mask_bhi = bhi; if (params.mask_broadcast_num_heads) { @@ -915,9 +933,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int k_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki; + const auto k_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + ki); + #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { if (ti < act_time_step) { @@ -940,8 +959,9 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( } #pragma unroll for (int local_hi = 0; local_hi < GQA_SUB_PARTITION_SIZE; local_hi++) { - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi); + K_vec q[K_VECS_PER_THREAD]; #pragma unroll for (int i = 0; i < K_VECS_PER_THREAD; ++i) { @@ -952,7 +972,8 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( float qk = Qk_dot::dot(q, k, params.inv_sqrt_dh); if (params.attn_mask) { - const int bhi = bi * params.q_num_head + hi; + const auto bhi(bi * params.q_num_head + hi); + auto mask_bhi = bhi; if (params.mask_broadcast_num_heads) { mask_bhi = bi; @@ -1058,9 +1079,10 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( int physical_block_number = block_table_smem[ti / BLOCK_SIZE]; const int block_offset = ti % BLOCK_SIZE; - const int v_offset = - physical_block_number * params.kv_num_head * BLOCK_SIZE * Dh + - kv_hi * BLOCK_SIZE * Dh + block_offset * Dh + vi; + const auto v_offset(physical_block_number * params.kv_num_head * + BLOCK_SIZE * Dh + + kv_hi * 
BLOCK_SIZE * Dh + block_offset * Dh + vi); + V_vec v; if (CACHE_TYPE == CacheType::INT8) { mul_pointer_v2( @@ -1160,13 +1182,17 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel( if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { #pragma unroll for (int local_hi = 0; local_hi < GQA_SUB_PARTITION_SIZE; local_hi++) { - const int hi = kv_hi * GQA_PARTITION_SIZE + - gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi; - const int ti = params.cum_offsets - ? bi * params.seq_len - params.cum_offsets[bi] - : -1; - const int thi = params.cum_offsets ? ti * params.q_num_head + hi : -1; - const int bhi = bi * params.q_num_head + hi; + const auto hi(kv_hi * GQA_PARTITION_SIZE + + gqa_sub_partition_id * GQA_SUB_PARTITION_SIZE + local_hi); + + const auto ti(params.cum_offsets + ? bi * params.seq_len - params.cum_offsets[bi] + : -1); + + const auto thi(params.cum_offsets ? ti * params.q_num_head + hi : -1); + + const auto bhi(bi * params.q_num_head + hi); + #ifdef MMHA_USE_FP32_ACUM_FOR_OUT V_vec tmp_out; convert_from_float(tmp_out, out[local_hi]); @@ -1905,7 +1931,8 @@ __global__ void cache_int8_kernel( if (seq_lens[ori_bi] == 0) continue; const uint32_t ori_seq_id = ori_token_idx % max_seq_len + pre_cache_length; - const int32_t *block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const auto *block_table_now(block_tables + ori_bi * max_blocks_per_seq); + const uint32_t block_idx = block_table_now[ori_seq_id / block_size]; const uint32_t block_offset = ori_seq_id % block_size; @@ -2002,7 +2029,8 @@ __global__ void cache_kernel( if (seq_lens[ori_bi] == 0) continue; const uint32_t ori_seq_id = ori_token_idx % max_seq_len + pre_cache_length; - const int32_t *block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const auto *block_table_now(block_tables + ori_bi * max_blocks_per_seq); + const uint32_t block_idx = block_table_now[ori_seq_id / block_size]; const uint32_t block_offset = ori_seq_id % block_size; @@ -2060,16 +2088,16 @@ __global__ void write_pre_cache_int8_to_cache( linear_index += step) { const int batch_id = linear_index / offset; if (seq_lens[batch_id] == 0) continue; - const int *block_table_now = block_tables + batch_id * max_blocks_per_seq; + const auto *block_table_now(block_tables + batch_id * max_blocks_per_seq); const int32_t cache_seq_id = (linear_index % hidden_size) / head_size; const int32_t head_id = (linear_index % cache_hidden_size) / hidden_size; const int32_t size_id = linear_index % head_size; const int32_t kv_id = (linear_index % offset) / cache_hidden_size; - const int32_t read_id = batch_id * cache_hidden_size + - head_id * hidden_size + cache_seq_id * head_size + - size_id; + const auto read_id(batch_id * cache_hidden_size + head_id * hidden_size + + cache_seq_id * head_size + size_id); + if (kv_id == 0) { phi::Load(&pre_key_cache[read_id], &src_vec); } else { @@ -2079,9 +2107,9 @@ __global__ void write_pre_cache_int8_to_cache( const int block_idx = block_table_now[cache_seq_id / block_size]; const int block_offset = cache_seq_id % block_size; - const int tgt_idx = block_idx * num_heads * block_size * head_size + - head_id * block_size * head_size + - block_offset * head_size + size_id; + const auto tgt_idx(block_idx * num_heads * block_size * head_size + + head_id * block_size * head_size + + block_offset * head_size + size_id); const float scale = kv_id == 0 ? 
cache_k_scales[head_id] : cache_v_scales[head_id]; @@ -2147,16 +2175,16 @@ __global__ void write_pre_cache_to_cache( linear_index += step) { const int batch_id = linear_index / offset; if (seq_lens[batch_id] == 0) continue; - const int *block_table_now = block_tables + batch_id * max_blocks_per_seq; + const auto *block_table_now(block_tables + batch_id * max_blocks_per_seq); const int32_t cache_seq_id = (linear_index % hidden_size) / head_size; const int32_t head_id = (linear_index % cache_hidden_size) / hidden_size; const int32_t size_id = linear_index % head_size; const int32_t kv_id = (linear_index % offset) / cache_hidden_size; - const int32_t read_id = batch_id * cache_hidden_size + - head_id * hidden_size + cache_seq_id * head_size + - size_id; + const auto read_id(batch_id * cache_hidden_size + head_id * hidden_size + + cache_seq_id * head_size + size_id); + if (kv_id == 0) { phi::Load(&pre_key_cache[read_id], &src_vec); } else { @@ -2166,9 +2194,9 @@ __global__ void write_pre_cache_to_cache( const int block_idx = block_table_now[cache_seq_id / block_size]; const int block_offset = cache_seq_id % block_size; - const int tgt_idx = block_idx * num_heads * block_size * head_size + - head_id * block_size * head_size + - block_offset * head_size + size_id; + const auto tgt_idx(block_idx * num_heads * block_size * head_size + + head_id * block_size * head_size + + block_offset * head_size + size_id); if (kv_id == 0) { phi::Store(src_vec, &key_cache[tgt_idx]); @@ -2208,7 +2236,7 @@ void CacheKernel(const phi::GPUContext &dev_ctx, const int32_t block_size = key_cache_out->dims()[2]; // stage 1: write qkv to cache [pre_cache_length:] - int elem_nums = num_tokens * 2 * kv_num_heads * head_size; // just k and v + auto elem_nums = num_tokens * 2 * kv_num_heads * head_size; // just k and v constexpr int PackSize = 16 / sizeof(T); int pack_num = elem_nums / PackSize; const int blocksize = 128; @@ -2376,9 +2404,9 @@ __global__ void quant_write_cache_int8_kernel( idx += blockDim.x * VecSize) { int token_idx = idx / head_size; int h_offset = idx % head_size; - int linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + - (qkv_id + head_group_size) * kv_num_heads * head_size + - hi * head_size + h_offset; + auto linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + + (qkv_id + head_group_size) * kv_num_heads * head_size + + hi * head_size + h_offset; Load(qkv + linear_idx, &in_vec); #pragma unroll @@ -2408,9 +2436,9 @@ __global__ void quant_write_cache_int8_kernel( idx += blockDim.x * VecSize) { int token_idx = idx / head_size; int h_offset = idx % head_size; - int linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + - (qkv_id + head_group_size) * kv_num_heads * head_size + - hi * head_size + h_offset; + auto linear_idx = token_idx * (2 * kv_num_heads + q_num_heads) * head_size + + (qkv_id + head_group_size) * kv_num_heads * head_size + + hi * head_size + h_offset; Load(qkv + linear_idx, &in_vec); #pragma unroll @@ -2423,7 +2451,8 @@ __global__ void quant_write_cache_int8_kernel( if (ori_bi != b_id) continue; const int ori_seq_id = ori_token_idx % max_seq_len + pre_cache_length; - const int *block_table_now = block_tables + ori_bi * max_blocks_per_seq; + const auto *block_table_now(block_tables + ori_bi * max_blocks_per_seq); + const int block_idx = block_table_now[ori_seq_id / block_size]; const int block_offset = ori_seq_id % block_size; // [max_block_num, num_head, block_size, head_dim/x, x] @@ -2518,8 +2547,9 @@ void DynamicQuantCacheKernel( if 
(pre_key_cache) { // stage 2: write pre_cache to cache [:pre_cache_length] - const int elem_nums = - batch_size * kv_num_heads * pre_cache_length * head_size * 2; + const auto elem_nums(batch_size * kv_num_heads * pre_cache_length * + head_size * 2); + const int pack_num = elem_nums / PackSize; const int blocksize = 128; int grid_size = 1; @@ -2586,7 +2616,8 @@ __global__ void VariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; + const auto emb_idx(ori_seq_id * half_lastdim + h_bias / 2); + const int64_t base_idx = token_idx * 3 * hidden_size + qkv_id * hidden_size + hi * last_dim + h_bias; phi::Load(&qkv[base_idx], &src_vec); @@ -2647,10 +2678,12 @@ __global__ void NeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int base_idx_left = token_idx * 3 * full_hidden_size + - qkv_id * full_hidden_size + hi * last_dim + - h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto base_idx_left(token_idx * 3 * full_hidden_size + + qkv_id * full_hidden_size + hi * last_dim + + h_bias); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); @@ -2687,7 +2720,7 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = token_num * 2 * head_num * dim_head; // just q and k + auto elem_nums = token_num * 2 * head_num * dim_head; // just q and k if (use_neox_style) { elem_nums = token_num * head_num * dim_head; } @@ -2823,10 +2856,12 @@ __global__ void GQANeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int base_idx_left = - token_idx * (q_num_head + 2 * kv_num_head) * last_dim + hi * last_dim + - h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto base_idx_left(token_idx * (q_num_head + 2 * kv_num_head) * + last_dim + + hi * last_dim + h_bias); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); @@ -2864,7 +2899,7 @@ void gqa_rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = + auto elem_nums = token_num * (q_head_num + kv_head_num) * dim_head; // just q and k if (use_neox_style) { elem_nums /= 2; @@ -2951,8 +2986,10 @@ __global__ void VariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * half_lastdim + h_bias / 2); + + const auto bias_idx(qkv_id * hidden_size + hi * last_dim + h_bias); + const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx; phi::Load(&qkv[base_idx], &src_vec); phi::Load(&qkv_biases[bias_idx], &bias_vec); @@ -3033,11 +3070,14 @@ __global__ void NeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = - qkv_id * full_hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(qkv_id * full_hidden_size + hi * last_dim + + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int base_idx_left = token_idx * 3 * full_hidden_size + bias_idx_left; + const auto 
base_idx_left(token_idx * 3 * full_hidden_size + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3093,7 +3133,7 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = token_num * 3 * head_num * dim_head; // just q and k + auto elem_nums = token_num * 3 * head_num * dim_head; // just q and k if (use_neox_style) { elem_nums = token_num * 3 * head_num * dim_head / 2; } @@ -3258,10 +3298,13 @@ __global__ void GQANeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(hi * last_dim + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int base_idx_left = token_idx * offset + bias_idx_left; + const auto base_idx_left(token_idx * offset + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3318,7 +3361,7 @@ void gqa_rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = + auto elem_nums = token_num * (q_head_num + 2 * kv_head_num) * dim_head; // for all q k v if (use_neox_style) { elem_nums /= 2; @@ -3405,8 +3448,10 @@ __global__ void VariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * half_lastdim + h_bias / 2); + + const auto bias_idx(qkv_id * hidden_size + hi * last_dim + h_bias); + const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx; phi::Load(&qkv[base_idx], &src_vec); phi::Load(&qkv_biases[bias_idx], &bias_vec); @@ -3477,11 +3522,14 @@ __global__ void NeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = - qkv_id * full_hidden_size + hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(qkv_id * full_hidden_size + hi * last_dim + + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int base_idx_left = token_idx * 3 * full_hidden_size + bias_idx_left; + const auto base_idx_left(token_idx * 3 * full_hidden_size + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3528,7 +3576,7 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = token_num * 3 * head_num * dim_head; // just q and k + auto elem_nums = token_num * 3 * head_num * dim_head; // just q and k if (use_neox_style) { elem_nums = token_num * 3 * head_num * dim_head / 2; } @@ -3677,10 +3725,13 @@ __global__ void GQANeoxVariableLengthRotaryKernel( const int ori_seq_id = ori_token_idx % seq_len; - const int emb_idx = ori_seq_id * last_dim + h_bias; - const int bias_idx_left = hi * last_dim + h_bias; + const auto emb_idx(ori_seq_id * last_dim + h_bias); + + const auto bias_idx_left(hi * last_dim + h_bias); + const int bias_idx_right = bias_idx_left + half_lastdim; - const int 
base_idx_left = token_idx * offset + bias_idx_left; + const auto base_idx_left(token_idx * offset + bias_idx_left); + const int base_idx_right = base_idx_left + half_lastdim; phi::Load(&qkv[base_idx_left], &left_vec); phi::Load(&qkv[base_idx_right], &right_vec); @@ -3728,7 +3779,7 @@ void gqa_rotary_qk_variable( const int input_output_len, const int dim_head, bool use_neox_style = false) { - int elem_nums = + auto elem_nums = token_num * (q_head_num + 2 * kv_head_num) * dim_head; // for all q k v if (use_neox_style) { elem_nums /= 2; @@ -3970,19 +4021,21 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, // [token_num, q_head_num or kv_head_num, size_per_head] if (head_id < q_head_num) { - const int32_t write_idx = token_idx * q_head_num * size_per_head + - head_id * size_per_head + size_id; + const auto write_idx(token_idx * q_head_num * size_per_head + + head_id * size_per_head + size_id); + phi::Store(src_vec, &q_buf[write_idx]); } else { if (head_id < q_head_num + kv_head_num) { - const int32_t write_idx = token_idx * kv_head_num * size_per_head + - (head_id - q_head_num) * size_per_head + - size_id; + const auto write_idx(token_idx * kv_head_num * size_per_head + + (head_id - q_head_num) * size_per_head + size_id); + phi::Store(src_vec, &k_buf[write_idx]); } else { - const int32_t write_idx = + const auto write_idx( token_idx * kv_head_num * size_per_head + - (head_id - q_head_num - kv_head_num) * size_per_head + size_id; + (head_id - q_head_num - kv_head_num) * size_per_head + size_id); + phi::Store(src_vec, &v_buf[write_idx]); } } @@ -4003,8 +4056,9 @@ void qkv_transpose_split(const phi::GPUContext &dev_ctx, const int kv_head_num, const int seq_len, const int size_per_head) { - const int32_t elem_cnt = - token_num * (q_head_num + kv_head_num * 2) * size_per_head; + const auto elem_cnt(token_num * (q_head_num + kv_head_num * 2) * + size_per_head); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -4065,9 +4119,9 @@ __global__ void write_pre_cache_to_kv_buffer( const int32_t kv_id = (linear_index % fused_hidden_size) / cache_hidden_size; - const int32_t read_id = batch_id * cache_hidden_size + - head_id * hidden_size + cache_seq_id * head_dim + - size_id; + const auto read_id(batch_id * cache_hidden_size + head_id * hidden_size + + cache_seq_id * head_dim + size_id); + if (kv_id == 0) { phi::Load(&pre_key_cache[read_id], &src_vec); } else { @@ -4075,10 +4129,11 @@ __global__ void write_pre_cache_to_kv_buffer( } const int tmp_max_len_this_time = max_len_this_time + pre_cache_length; - const int32_t write_idx = - batch_id * num_head * tmp_max_len_this_time * head_dim + - head_id * tmp_max_len_this_time * head_dim + cache_seq_id * head_dim + - size_id; + const auto write_idx(batch_id * num_head * tmp_max_len_this_time * + head_dim + + head_id * tmp_max_len_this_time * head_dim + + cache_seq_id * head_dim + size_id); + if (kv_id == 0) { phi::Store(src_vec, &k_buf[write_idx]); } else { @@ -4126,28 +4181,31 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, const int tmp_max_len_this_time = max_len_this_time + (head_id < q_head_num ? 0 : pre_cache_length); - const int tmp_seq_id = - head_id < q_head_num ? seq_id : seq_id + pre_cache_length; + const auto tmp_seq_id(head_id < q_head_num ? 
seq_id + : seq_id + pre_cache_length); if (head_id < q_head_num) { - const int write_idx = - target_batch_id * q_head_num * tmp_max_len_this_time * size_per_head + - head_id * tmp_max_len_this_time * size_per_head + - tmp_seq_id * size_per_head + size_id; + const auto write_idx(target_batch_id * q_head_num * + tmp_max_len_this_time * size_per_head + + head_id * tmp_max_len_this_time * size_per_head + + tmp_seq_id * size_per_head + size_id); + phi::Store(src_vec, &q_buf[write_idx]); } else if (head_id < q_head_num + kv_head_num) { - const int write_idx = - target_batch_id * kv_head_num * tmp_max_len_this_time * - size_per_head + - (head_id - q_head_num) * tmp_max_len_this_time * size_per_head + - tmp_seq_id * size_per_head + size_id; + const auto write_idx(target_batch_id * kv_head_num * + tmp_max_len_this_time * size_per_head + + (head_id - q_head_num) * tmp_max_len_this_time * + size_per_head + + tmp_seq_id * size_per_head + size_id); + phi::Store(src_vec, &k_buf[write_idx]); } else { - const int write_idx = target_batch_id * kv_head_num * - tmp_max_len_this_time * size_per_head + - (head_id - q_head_num - kv_head_num) * - tmp_max_len_this_time * size_per_head + - tmp_seq_id * size_per_head + size_id; + const auto write_idx(target_batch_id * kv_head_num * + tmp_max_len_this_time * size_per_head + + (head_id - q_head_num - kv_head_num) * + tmp_max_len_this_time * size_per_head + + tmp_seq_id * size_per_head + size_id); + phi::Store(src_vec, &v_buf[write_idx]); } } @@ -4172,7 +4230,7 @@ void qkv_transpose_split( const int seq_len, const int pre_cache_length, const int size_per_head) { - int32_t elem_cnt = token_num * (q_head_num + kv_head_num * 2) * size_per_head; + auto elem_cnt = token_num * (q_head_num + kv_head_num * 2) * size_per_head; constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, @@ -4241,8 +4299,10 @@ __global__ void GetDecoderTensorKernel(const T *qkv_out, i += gridDim.x * blockDim.x * VecSize) { const int bi = i / fused_hidden_size; const int bias_idx = i % fused_hidden_size; - const int ori_token_idx = bi * seq_len - cum_offsets[bi]; - const int src_offset = ori_token_idx * fused_hidden_size + bias_idx; + const auto ori_token_idx(bi * seq_len - cum_offsets[bi]); + + const auto src_offset(ori_token_idx * fused_hidden_size + bias_idx); + if (src_offset >= qkv_out_nums) continue; phi::Load(&qkv_out[src_offset], &src_vec); phi::Store(src_vec, &qkv_out_decoder[i]); @@ -4267,7 +4327,8 @@ __global__ void GetDecoderRoPEKernel(const T *rope_emb, for (int i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int bi = i / dim_head; - const int src_offset = bi * seq_len * dim_head + i % dim_head; + const auto src_offset(bi * seq_len * dim_head + i % dim_head); + phi::Load(&rope_cos_emb[src_offset], &src_vec); phi::Store(src_vec, &cos_emb[i]); phi::Load(&rope_sin_emb[src_offset], &src_vec); @@ -4429,9 +4490,10 @@ __global__ void TransposeRemovingPadding(const T *input_data, const int ori_seq_id = ori_token_idx % seq_len; const int ori_head_id = (linear_index % dim_embed) / head_dim; const int ori_head_lane = (linear_index % dim_embed) % head_dim; - const int ori_idx = ori_batch_id * num_head * max_len_this_time * head_dim + - ori_head_id * max_len_this_time * head_dim + - ori_seq_id * head_dim + ori_head_lane; + const auto ori_idx(ori_batch_id * num_head * max_len_this_time * head_dim + + ori_head_id * max_len_this_time * head_dim + + ori_seq_id * head_dim + ori_head_lane); + phi::Load(&input_data[ori_idx], 
&src_vec); phi::Store(src_vec, &output_data[linear_index]); } @@ -4452,7 +4514,8 @@ void InvokeTransposeRemovePadding(const phi::GPUContext &dev_ctx, // [batch_size, num_head, max_len_this_time, head_dim] -> [token_num, // num_head, head_dim] constexpr int VEC_16B = 16; - const int elem_cnt = token_num * num_head * head_dim; + const auto elem_cnt(token_num * num_head * head_dim); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ( head_dim % PackSize, diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu index dc2d495f7bb18d..f6056dd8215952 100644 --- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu @@ -341,7 +341,8 @@ void DispatchWithDtype( const int kv_num_head = key_cache_dims[1]; const int dim_head = key_cache_dims[3]; const int total_num_head = qkv.dims()[qkv.dims().size() - 1] / dim_head; - const int q_num_head = total_num_head - 2 * kv_num_head; + const auto q_num_head(total_num_head - 2 * kv_num_head); + const int bsz = cum_offsets.dims()[0]; const int max_block_per_seq = block_tables.dims()[1]; VLOG(3) << "bsz: " << bsz << " token_num: " << token_num diff --git a/paddle/phi/kernels/fusion/gpu/fmha_ref.h b/paddle/phi/kernels/fusion/gpu/fmha_ref.h index 98e456f177e27e..7b71bdbbe2817a 100644 --- a/paddle/phi/kernels/fusion/gpu/fmha_ref.h +++ b/paddle/phi/kernels/fusion/gpu/fmha_ref.h @@ -95,9 +95,10 @@ __global__ void TransposeRemovingPadding(const T* input_data, const int ori_seq_id = ori_token_idx % seq_len; const int ori_head_id = (linear_index % dim_embed) / head_dim; const int ori_head_lane = (linear_index % dim_embed) % head_dim; - const int ori_idx = ori_batch_id * num_head * seq_len * head_dim + - ori_head_id * seq_len * head_dim + - ori_seq_id * head_dim + ori_head_lane; + const auto ori_idx(ori_batch_id * num_head * seq_len * head_dim + + ori_head_id * seq_len * head_dim + + ori_seq_id * head_dim + ori_head_lane); + phi::Load(&input_data[ori_idx], &src_vec); phi::Store(src_vec, &output_data[linear_index]); } @@ -116,7 +117,8 @@ void InvokeTransposeRemovePadding(const phi::GPUContext& dev_ctx, // [batch_size, num_head, seq_len, head_dim] -> [token_num, num_head, // head_dim] constexpr int VEC_16B = 16; - const int elem_cnt = token_num * num_head * head_dim; + const auto elem_cnt(token_num * num_head * head_dim); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ( head_dim % PackSize, @@ -535,7 +537,7 @@ class FMHARef { phi::DenseTensor* src_mask_grad_tensor, phi::DenseTensor* qkv_input_grad_tensor) { auto blas = phi::funcs::GetBlas(dev_ctx_); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; + auto q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int k_size = q_size; int softmax_axis = -1; diff --git a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu index 3b3c78e45fad23..24cca298143663 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_grad_kernel.cu @@ -59,8 +59,8 @@ void ComputeMergedQKVMatmulBackward( dev_ctx.Alloc(qkv_weight_grad, qkv_weight_grad->numel() * sizeof(T)); // Gradient of GEMM(query, qkv_weight) - int m = config.batch_size * config.seq_len_m * config.seq_len_r; - int n = 3 * config.num_heads * config.head_dim; + auto m = config.batch_size * 
config.seq_len_m * config.seq_len_r; + auto n = 3 * config.num_heads * config.head_dim; int k = config.q_dim; auto qkv_compute = phi::fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); @@ -95,7 +95,7 @@ void ComputeSeparatedQKVMatmulBackward( const auto *key_weight = &key_weight_in; dev_ctx.Alloc(key_weight_grad, key_weight_grad->numel() * sizeof(T)); - int kv_m = config.batch_size * config.seq_len_m * config.m_size; + auto kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; int kv_k = config.kv_dim; auto kv_compute = phi::fusion::AttnMatMul( @@ -119,7 +119,7 @@ void ComputeSeparatedQKVMatmulBackward( const auto *query_weight = &query_weight_in; dev_ctx.Alloc(query_weight_grad, query_weight_grad->numel() * sizeof(T)); - int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto q_m = config.batch_size * config.seq_len_m * config.seq_len_r; int q_n = config.num_heads * config.head_dim; int q_k = config.q_dim; auto q_compute = @@ -155,7 +155,7 @@ void ComputeGatingLinearBackward( gate_bias_out.Resize(config.gate_out_dims); dev_ctx.Alloc(&gate_bias_out, gate_bias_out.numel() * sizeof(T)); - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.num_heads * config.head_dim; int k = config.q_dim; auto gate_linear = @@ -208,7 +208,7 @@ void ComputeOutputLinearBackward( dev_ctx.Alloc(out_linear_bias_grad, out_linear_bias_grad->numel() * sizeof(T)); - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; int k = config.num_heads * config.head_dim; auto out_linear = diff --git a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu index d1722a5006ce64..1d896ba7b32a3a 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_gate_attention_kernel.cu @@ -49,8 +49,8 @@ void ComputeMergedQKVMatmulForward( auto *qkv_weight = &qkv_weight_in; // qkv_out = GEMM(query, qkv_weight^T) - int m = config.batch_size * config.seq_len_m * config.seq_len_r; - int n = 3 * config.num_heads * config.head_dim; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto n = 3 * config.num_heads * config.head_dim; int k = config.q_dim; auto qkv_compute = phi::fusion::AttnMatMul(dev_ctx, false, true, m, n, k, false); @@ -77,7 +77,7 @@ void ComputeSeparatedQKVMatmulForward( // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] // query_weight: shape=[q_dim, num_heads, head_dim] // query_out: shape=[batch_size, seq_len_m, seq_len_r, num_heads, head_dim] - int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto q_m = config.batch_size * config.seq_len_m * config.seq_len_r; int q_n = config.num_heads * config.head_dim; int q_k = config.q_dim; auto q_compute = @@ -88,7 +88,7 @@ void ComputeSeparatedQKVMatmulForward( // key: shape=[batch_size, seq_len_m, m_size, kv_dim] // key_weight: shape=[kv_dim, num_heads, head_dim] // key_out: shape=[batch_size, seq_len_m, m_size, num_heads, head_dim] - int kv_m = config.batch_size * config.seq_len_m * config.m_size; + auto kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; int kv_k = config.kv_dim; auto kv_compute = phi::fusion::AttnMatMul( @@ -116,7 +116,7 @@ void ComputeGatingLinearForward( // and the 
second gate_bias_out stores the result of the multiplication + // bias. // gate_out = GEMM(query, gate_weight) + gate_bias - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.num_heads * config.head_dim; int k = config.q_dim; auto gate_linear = @@ -148,7 +148,7 @@ void ComputeOutputLinearForward( const auto *out_linear_bias = &out_linear_bias_in; // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias - int m = config.batch_size * config.seq_len_m * config.seq_len_r; + auto m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; int k = config.num_heads * config.head_dim; auto out_linear = diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d559ff6..5ec0b78974d3d8 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -343,7 +343,7 @@ __global__ void FusedLayernormResidualDropoutBiasInfer( T *layernorm_dst) { int col_id = threadIdx.x; int row_id = blockIdx.x; - int idx = row_id * cols + col_id; + auto idx = row_id * cols + col_id; GPURAND(StatePhilox4_32_10_t) state; GPURAND(_init)(seed, idx, increment, &state); @@ -579,10 +579,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( const int warp_n = warp % WARPS_N; // 0 const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 - const int c = warp_n * THREADS_PER_WARP + lane; // lane - const int r = bidx * ROWS_PER_CTA + warp_m; // row id + const auto c(warp_n * THREADS_PER_WARP + lane); + // lane + const auto r(bidx * ROWS_PER_CTA + warp_m); + // row id - int idx = r * ELTS_PER_ROW + c; + auto idx = r * ELTS_PER_ROW + c; GPURAND(StatePhilox4_32_10_t) state; if (HasDropout) { GPURAND(_init)(seed, idx, increment, &state); diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu index 72c5453b439ff6..ec536b9ca97f29 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu @@ -208,9 +208,9 @@ void FusedMultiTransformerOpKernel( dim_head = trans_qkvw ? qkv_w_dims[2] : qkv_w_dims[3]; } int hidden_size = num_head * dim_head; - int output_size = gqa_group_size <= 0 - ? 3 * hidden_size - : (num_head + 2 * gqa_group_size) * dim_head; + auto output_size = gqa_group_size <= 0 + ? 3 * hidden_size + : (num_head + 2 * gqa_group_size) * dim_head; int input_size = dim_embed; // Set a flag whether need to add Matmul / Layernorm bias. diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h index a8191bc6b4a313..5970013c790e62 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h @@ -153,11 +153,15 @@ __global__ void masked_multihead_attention_kernel( const int hi = blockIdx.x; const int kv_hi = hi / params.gqa_num_per_partitions; // if no gqa, kv_hi = hi - const int bhi = bi * params.num_head + hi; - const int bbhi = bbi * params.beam_width * params.num_head + hi; - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; - const int thi = params.cum_offsets ? 
ti * params.num_head + hi : -1; + const auto bhi(bi * params.num_head + hi); + + const auto bbhi(bbi * params.beam_width * params.num_head + hi); + + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); + + const auto thi(params.cum_offsets ? ti * params.num_head + hi : -1); + const int tid = threadIdx.x; const int bi_seq_len_offset = bi * params.max_seq_length; @@ -170,7 +174,8 @@ __global__ void masked_multihead_attention_kernel( : params.sequence_lengths[bi]; // qkv [B, S=1, num_head + 2 * gqa_group_size, head_dim] - int qkv_base_offset = bi * (params.num_head + 2 * params.gqa_group_size) * Dh; + auto qkv_base_offset = + bi * (params.num_head + 2 * params.gqa_group_size) * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -194,9 +199,10 @@ __global__ void masked_multihead_attention_kernel( } if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - const int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + tid * QK_VEC_SIZE; + auto qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + tid * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + tid * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -234,7 +240,7 @@ __global__ void masked_multihead_attention_kernel( if (!params.neox_rotary_style) { if (params.rotary_emb_dims != 0) { - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; Qk_vec_RoPE cos_emb, sin_emb; @@ -255,16 +261,17 @@ __global__ void masked_multihead_attention_kernel( if (params.rotary_emb_dims != 0) { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); if (Dh == Dh_MAX || right_id * QK_VEC_SIZE < Dh) { @@ -304,10 +311,10 @@ __global__ void masked_multihead_attention_kernel( int co = tid / QK_VECS_IN_16B; int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = bi * params.gqa_group_size * params.max_seq_length * Dh + - kv_hi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - act_time_step * QK_ELTS_IN_16B + ci; + auto offset = bi * params.gqa_group_size * params.max_seq_length * Dh + + kv_hi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + act_time_step * QK_ELTS_IN_16B + ci; if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { *reinterpret_cast(¶ms.cache_kv[offset]) = k; } @@ -376,7 +383,7 @@ __global__ void masked_multihead_attention_kernel( zero(k_vec_zero); #pragma 
unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + ti; + auto jj = ii * params.max_seq_length + ti; // get k from the cache_kv, and dequant k for qk operation if (ti < act_time_step) { k[ii] = @@ -803,17 +810,19 @@ __global__ void multi_block_masked_multihead_attention_kernel( const int hi = blockIdx.x; // head_idx const int kv_hi = hi / params.gqa_num_per_partitions; - const int bhi = bi * params.num_head + hi; - const int ti = - params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1; - const int thi = params.cum_offsets ? ti * params.num_head + hi : -1; + const auto bhi(bi * params.num_head + hi); + + const auto ti( + params.cum_offsets ? bi * params.seq_len - params.cum_offsets[bi] : -1); + + const auto thi(params.cum_offsets ? ti * params.num_head + hi : -1); float qk_max = -FLT_MAX; float qk = 0; // qkv [B, S=1, 3, num_head, head_dim] - int qkv_base_offset = bi * (params.num_head + 2 * params.gqa_group_size) * - Dh; // // if no gqa, gqa_group_size = num_head + auto qkv_base_offset = bi * (params.num_head + 2 * params.gqa_group_size) * + Dh; // // if no gqa, gqa_group_size = num_head constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); @@ -836,7 +845,7 @@ __global__ void multi_block_masked_multihead_attention_kernel( } if (tid < QK_VECS_PER_WARP) { - const int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + const auto qk_offset(qkv_base_offset + tid * QK_VEC_SIZE); Qk_vec q; zero(q); @@ -852,8 +861,10 @@ __global__ void multi_block_masked_multihead_attention_kernel( } if (params.add_qkv_bias) { - const int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - const int k_bias_offset = kv_hi * Dh + tid * QK_VEC_SIZE; + const auto q_bias_offset(hi * Dh + tid * QK_VEC_SIZE); + + const auto k_bias_offset(kv_hi * Dh + tid * QK_VEC_SIZE); + Qk_vec q_bias; zero(q_bias); Qk_vec k_bias; @@ -874,7 +885,7 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (!params.neox_rotary_style) { if (params.rotary_emb_dims != 0) { - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; Qk_vec_RoPE cos_emb, sin_emb; @@ -895,16 +906,17 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (params.rotary_emb_dims != 0) { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.rotary_bsz * Dh; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int q_right_offset = qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; - int k_right_offset = qkv_base_offset + params.num_head * Dh + - kv_hi * Dh + right_id * QK_VEC_SIZE; + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto q_right_offset = + qkv_base_offset + hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_offset = qkv_base_offset + params.num_head * Dh + + kv_hi * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); if (Dh == Dh_MAX || right_id * QK_VEC_SIZE < Dh) { @@ -944,10 +956,10 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) { int co = 
tid / QK_VECS_IN_16B; int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = bi * params.gqa_group_size * params.max_seq_length * Dh + - kv_hi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - act_time_step * QK_ELTS_IN_16B + ci; + auto offset = bi * params.gqa_group_size * params.max_seq_length * Dh + + kv_hi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + act_time_step * QK_ELTS_IN_16B + ci; *reinterpret_cast(¶ms.cache_kv[offset]) = k; } @@ -1011,13 +1023,13 @@ __global__ void multi_block_masked_multihead_attention_kernel( for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { // First, move each block to their start position. const int time_now = ti + partition_times_timesteps_per_block; - const int k_offset = + const auto k_offset( bi * params.gqa_group_size * params.max_seq_length * Dh + - kv_hi * params.max_seq_length * Dh + time_now * Dh + ki; + kv_hi * params.max_seq_length * Dh + time_now * Dh + ki); #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + time_now; + auto jj = ii * params.max_seq_length + time_now; if (time_now < act_time_step) { k[ii] = (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.max_seq_length) @@ -1209,9 +1221,10 @@ __global__ void multi_block_masked_multihead_attention_kernel( if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { // Compute the index to store in `partial_out`. - const int32_t store_partial_idx = + const auto store_partial_idx( bi * params.num_head * params.max_num_partitions * Dh + - hi * params.max_num_partitions * Dh + partition_idx * Dh + vi; + hi * params.max_num_partitions * Dh + partition_idx * Dh + vi); + // Actually, we do not need the store_func, just use T vectorized type // `V_vec` to store in params.partial_out. #ifdef MMHA_USE_FP32_ACUM_FOR_OUT @@ -1240,11 +1253,13 @@ __launch_bounds__(THREADS_PER_BLOCK) void multi_block_attention_reduce_kernel( return; } - const int bhi = seq_idx * params.num_head + head_idx; - const int ti = params.cum_offsets - ? seq_idx * params.seq_len - params.cum_offsets[seq_idx] - : -1; - const int thi = params.cum_offsets ? ti * params.num_head + head_idx : -1; + const auto bhi(seq_idx * params.num_head + head_idx); + + const auto ti(params.cum_offsets + ? seq_idx * params.seq_len - params.cum_offsets[seq_idx] + : -1); + + const auto thi(params.cum_offsets ? 
ti * params.num_head + head_idx : -1); const int num_partitions = div_up(context_len, params.partition_size); if (num_partitions == 1) { @@ -1367,7 +1382,7 @@ inline size_t get_reduce_smem_size_in_bytes( const Masked_multihead_attention_params ¶ms) { const int32_t max_num_partitions = div_up(params.timestep, params.partition_size); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + auto reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); VLOG(1) << "get_reduce_smem_size_in_bytes, reduce_shared_mem_size: " << reduce_shared_mem_size; return reduce_shared_mem_size; @@ -1776,8 +1791,8 @@ void write_cache_kv(const phi::GPUContext &dev_ctx, common::errors::PreconditionNotMet( "dim_head=%d must be divisible by vec_size=%d", dim_head, x)); - int max_size = max_seq_len * dim_head / x; - int size = seq_len * dim_head / x; + auto max_size = max_seq_len * dim_head / x; + auto size = seq_len * dim_head / x; dim3 grid(div_up(max_size, block_sz), bsz, num_head); dim3 grid_v(div_up(size, block_sz), bsz, num_head); @@ -1819,10 +1834,10 @@ __global__ void gqa_write_cache_k_kernel(T *cache_k, const int local_token_id = ori_token_id % seq_len; - const int tgt_idx = ori_bi * gqa_group_size * max_seq_len * dim_head + - head_idx * max_seq_len * dim_head + - head_vec_id * max_seq_len * X_ELEMS + - local_token_id * X_ELEMS; + const auto tgt_idx(ori_bi * gqa_group_size * max_seq_len * dim_head + + head_idx * max_seq_len * dim_head + + head_vec_id * max_seq_len * X_ELEMS + + local_token_id * X_ELEMS); phi::Load(&k[linear_idx], &in_vec); phi::Store(in_vec, &cache_k[tgt_idx]); @@ -1855,9 +1870,9 @@ __global__ void gqa_write_cache_v_kernel(T *cache_v, const int local_token_id = ori_token_id % seq_len; - const int tgt_idx = ori_bi * gqa_group_size * max_seq_len * dim_head + - head_idx * max_seq_len * dim_head + - local_token_id * dim_head + head_offset; + const auto tgt_idx(ori_bi * gqa_group_size * max_seq_len * dim_head + + head_idx * max_seq_len * dim_head + + local_token_id * dim_head + head_offset); phi::Load(&v[linear_idx], &in_vec); phi::Store(in_vec, &cache_v[tgt_idx]); @@ -1955,8 +1970,9 @@ __global__ void fusedQKV_transpose_split_kernel(T *q_buf, const int32_t head_id = (linear_index % hidden_size) / size_per_head; const int32_t size_id = linear_index % size_per_head; - const int32_t write_idx = - token_idx * hidden_size + head_id * size_per_head + size_id; + const auto write_idx(token_idx * hidden_size + head_id * size_per_head + + size_id); + if (qkv_id == 0) { phi::Store(src_vec, &q_buf[write_idx]); } else if (qkv_id == 1) { @@ -1980,7 +1996,8 @@ void qkv_transpose_split(const phi::GPUContext &dev_ctx, const int head_num, const int seq_len, const int size_per_head) { - const int32_t elem_cnt = token_num * head_num * size_per_head * 3; + const auto elem_cnt(token_num * head_num * size_per_head * 3); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2020,7 +2037,8 @@ __global__ void add_fusedQKV_bias_transpose_split_kernel( const int token_num, const int head_num, const int size_per_head) { - const int32_t offset = batch_size * seq_len * head_num * size_per_head; + const auto offset(batch_size * seq_len * head_num * size_per_head); + const int32_t hidden_size = head_num * size_per_head; const int32_t fused_hidden_size = 3 * hidden_size; int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -2101,7 +2119,8 @@ void qkv_bias_add_transpose_split(const phi::GPUContext &dev_ctx, const int seq_len, const int 
size_per_head, bool compute_bias) { - const int32_t elem_cnt = token_num * head_num * size_per_head * 3; + const auto elem_cnt(token_num * head_num * size_per_head * 3); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2179,19 +2198,21 @@ __global__ void gqa_fusedQKV_transpose_split_kernel(T *q_buf, // [token_num, num_head or gqa_group_size, size_per_head] if (head_id < head_num) { - const int32_t write_idx = token_idx * head_num * size_per_head + - head_id * size_per_head + size_id; + const auto write_idx(token_idx * head_num * size_per_head + + head_id * size_per_head + size_id); + phi::Store(src_vec, &q_buf[write_idx]); } else { if (head_id < head_num + gqa_group_size) { - const int32_t write_idx = token_idx * gqa_group_size * size_per_head + - (head_id - head_num) * size_per_head + - size_id; + const auto write_idx(token_idx * gqa_group_size * size_per_head + + (head_id - head_num) * size_per_head + size_id); + phi::Store(src_vec, &k_buf[write_idx]); } else { - const int32_t write_idx = + const auto write_idx( token_idx * gqa_group_size * size_per_head + - (head_id - head_num - gqa_group_size) * size_per_head + size_id; + (head_id - head_num - gqa_group_size) * size_per_head + size_id); + phi::Store(src_vec, &v_buf[write_idx]); } } @@ -2212,8 +2233,9 @@ void gqa_qkv_transpose_split(const phi::GPUContext &dev_ctx, const int seq_len, const int size_per_head, const int gqa_group_size) { - const int32_t elem_cnt = - token_num * (head_num + 2 * gqa_group_size) * size_per_head; + const auto elem_cnt(token_num * (head_num + 2 * gqa_group_size) * + size_per_head); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2259,12 +2281,13 @@ __global__ void NeoXRotaryKernel(const T *input, if (sequence_lengths && si >= sequence_lengths[bi] * rotary_emb_dims) return; int half_lastdim = last_dim / 2; for (int ti = threadIdx.x; ti < half_lastdim; ti += blockDim.x) { - int base_idx = bi * head_num * seq_len * last_dim + - hi * seq_len * last_dim + si * last_dim; + auto base_idx = bi * head_num * seq_len * last_dim + + hi * seq_len * last_dim + si * last_dim; int left_idx = base_idx + ti; - const int right_idx = base_idx + ti + half_lastdim; - int emb_idx_left = bi * seq_len * last_dim + si * last_dim + ti; - int emb_idx_right = + const auto right_idx(base_idx + ti + half_lastdim); + + auto emb_idx_left = bi * seq_len * last_dim + si * last_dim + ti; + auto emb_idx_right = bi * seq_len * last_dim + si * last_dim + ti + half_lastdim; float input_left = static_cast(input[left_idx]); float input_right = static_cast(input[right_idx]); @@ -2302,11 +2325,12 @@ __global__ void RotaryKernel(const T *input, // Note(ZhenyuLi): Calculate the relevant data at one time, so that no // additional space is required. 
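  // Worked example of the two rotary index pairings (illustrative numbers
  // only, not taken from the original source): with last_dim = 128 and
  // ti = 3, this interleaved (non-NeoX) kernel rotates base_idx + 6 with
  // base_idx + 7 using cos_emb/sin_emb at emb_idx = bi * seq_len * 128 +
  // si * 128 + 6, while NeoXRotaryKernel above instead pairs base_idx + 3
  // with base_idx + 3 + 64 and reads separate emb_idx_left / emb_idx_right
  // entries at row offsets ti and ti + half_lastdim.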
for (int ti = threadIdx.x; ti < half_lastdim; ti += blockDim.x) { - int base_idx = bi * head_num * seq_len * last_dim + - hi * seq_len * last_dim + si * last_dim; - int left_idx = base_idx + 2 * ti; - const int right_idx = base_idx + 2 * ti + 1; - int emb_idx = bi * seq_len * last_dim + si * last_dim + 2 * ti; + auto base_idx = bi * head_num * seq_len * last_dim + + hi * seq_len * last_dim + si * last_dim; + auto left_idx = base_idx + 2 * ti; + const auto right_idx(base_idx + 2 * ti + 1); + + auto emb_idx = bi * seq_len * last_dim + si * last_dim + 2 * ti; float input_left = static_cast(input[left_idx]); float input_right = static_cast(input[right_idx]); float cos_tmp = cos_emb[emb_idx]; @@ -2659,7 +2683,7 @@ __global__ void BiasAct(const T *bias, i += gridDim.x * blockDim.x * VecSize) { int row_idx = i / cols; int col_idx = i % cols; - int linear_idx = row_idx * cols + col_idx; + auto linear_idx = row_idx * cols + col_idx; // phi::Load(&input[linear_idx], &src_vec); load_func.template load(&src_vec, linear_idx); if (bias) { @@ -2730,8 +2754,8 @@ __global__ void fused_transpose_split_kernel( const int token_num, const int head_num, const int size_per_head) { - const int32_t offset = - batch_size * max_len_this_time * head_num * size_per_head; + const auto offset(batch_size * max_len_this_time * head_num * size_per_head); + const int32_t hidden_size = head_num * size_per_head; const int32_t fused_hidden_size = 3 * hidden_size; int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -2776,7 +2800,7 @@ __global__ void fused_transpose_split_kernel( seq_id * size_per_head + size_id], &src_vec); } - int32_t write_index = + auto write_index = linear_index - (qkv_id + 2 * current_token) * hidden_size; if (qkv_id == 0) { phi::Store(src_vec, &q_out[write_index]); @@ -2803,7 +2827,8 @@ void TransposeSplit(const phi::GPUContext &dev_ctx, const int max_len_this_time, const int seq_len, const int size_per_head) { - const int32_t elem_cnt = token_num * head_num * size_per_head * 3; + const auto elem_cnt(token_num * head_num * size_per_head * 3); + constexpr int PackSize = VEC_16B / sizeof(T); PADDLE_ENFORCE_EQ(size_per_head % PackSize, 0, @@ -2953,7 +2978,8 @@ void rotary_qk_variable( const int input_output_len, const int dim_head, const int rope_bsz) { - const int elem_nums = token_num * 3 * head_num * dim_head; // just q and k + const auto elem_nums(token_num * 3 * head_num * dim_head); + // just q and k constexpr int PackSize = 16 / sizeof(T); const int pack_num = elem_nums / PackSize; const int blocksize = 128; @@ -3067,8 +3093,8 @@ void gqa_rotary_qk_variable( const int dim_head, const int gqa_group_size, const int rope_bsz) { - const int elem_nums = - token_num * (head_num + 2 * gqa_group_size) * dim_head; // for all q k v + const auto elem_nums(token_num * (head_num + 2 * gqa_group_size) * dim_head); + // for all q k v constexpr int PackSize = 16 / sizeof(T); const int pack_num = elem_nums / PackSize; const int blocksize = 128; diff --git a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu index a7cd7aebb92c7f..332cb1364ba46d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu @@ -264,8 +264,8 @@ void FusedSeqpoolCVMGradCUDAKernel( } } - int cur_batch_size = in_grad->lod().size() ? in_grad->lod()[0].size() - 1 - : in_grad->dims()[0]; + auto cur_batch_size = in_grad->lod().size() ? 
in_grad->lod()[0].size() - 1 + : in_grad->dims()[0]; if (batch_size == -1) { batch_size = cur_batch_size; } else { diff --git a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu index 65b96dc22d8357..345a2af55bd01f 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu @@ -298,7 +298,7 @@ void FusedSeqpoolCVMCUDAKernel(const Context &dev_ctx, lods.push_back(i + 1); } } - int cur_batch_size = + auto cur_batch_size = input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0]; if (batch_size == -1) { batch_size = cur_batch_size; diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu index 1a17ede68774c1..e31a11070fb4db 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu @@ -65,7 +65,7 @@ __global__ void SoftmaxMaskFuseGradGPUKernel(const T* grad_input, #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * WARP_SIZE; + auto data_index = kOneLoadingCounts * local_idx + ii * WARP_SIZE; if (data_index < batch_data) { load_data(temp_grad_input, grad_input + i * key_seq_len + ii * warp_size); @@ -103,7 +103,7 @@ __global__ void SoftmaxMaskFuseGradGPUKernel(const T* grad_input, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto data_index = kOneLoadingCounts * local_idx + ii * warp_size; if (data_index < key_seq_len) { // compute gradients T samples_out[kOneLoadingCounts]; @@ -149,7 +149,7 @@ void FusedSoftmaxMaskGradKernel(const Context& dev_ctx, // use 128 threads per block to maximum gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; int64_t blocks = batch_count / batches_per_block; dim3 threads(warp_size, warps_per_block, 1); diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu index dcedf010bad4b6..490fde17889cce 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu @@ -72,8 +72,9 @@ __global__ void SoftmaxMaskFuseV1GPUKernel(const T* x_data, // might be many batches per warp. 
compute the index within the batch int local_idx = threadIdx.x; - int x_offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; - int mask_offset = mask_fist_idx * key_seq_len + kOneLoadingCounts * local_idx; + auto x_offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; + auto mask_offset = + mask_fist_idx * key_seq_len + kOneLoadingCounts * local_idx; x_data += x_offset; mask_data += mask_offset; y_data += x_offset; @@ -89,10 +90,10 @@ __global__ void SoftmaxMaskFuseV1GPUKernel(const T* x_data, #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto data_index = kOneLoadingCounts * local_idx + ii * warp_size; if (data_index < batch_data) { - int itr_idx = i * key_seq_len + ii * warp_size; + auto itr_idx = i * key_seq_len + ii * warp_size; // efficiently load data from global memory load_data(temp_data, x_data + itr_idx); @@ -148,7 +149,7 @@ __global__ void SoftmaxMaskFuseV1GPUKernel(const T* x_data, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int idx = kOneLoadingCounts * local_idx + ii * warp_size; + auto idx = kOneLoadingCounts * local_idx + ii * warp_size; if (idx < key_seq_len) { #pragma unroll for (int counter = 0; counter < kOneLoadingCounts; ++counter) { @@ -234,10 +235,10 @@ __global__ void SoftmaxMaskFuseV2GPUKernel(const T* x_data, #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto data_index = kOneLoadingCounts * local_idx + ii * warp_size; if (data_index < batch_data) { - int itr_idx = i * key_seq_len + ii * warp_size; + auto itr_idx = i * key_seq_len + ii * warp_size; // efficiently load data from global memory load_data(temp_data, x_data + itr_idx); @@ -293,7 +294,7 @@ __global__ void SoftmaxMaskFuseV2GPUKernel(const T* x_data, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int idx = kOneLoadingCounts * local_idx + ii * warp_size; + auto idx = kOneLoadingCounts * local_idx + ii * warp_size; if (idx < key_seq_len) { #pragma unroll for (int counter = 0; counter < kOneLoadingCounts; ++counter) { @@ -538,7 +539,7 @@ void FusedSoftmaxMaskKernel(const Context& dev_ctx, // use 128 threads per block to maximum gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; PADDLE_ENFORCE_EQ( query_seq_len % batches_per_block, diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu index ddf59e49be0ad5..349ae52a714394 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu @@ -178,7 +178,7 @@ void FusedSoftmaxMaskFuseUpperTriangleGradKernel(const Context& dev_ctx, // use 128 threads per block to maximum gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; // if we use dim3 blocks(query_seq_len, // (attn_mul_batch + batches_per_block) / 
batches_per_block, diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu index 0a5b7ef202a2de..66babfe14d6e4d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu @@ -190,7 +190,7 @@ void FusedSoftmaxMaskFuseUpperTriangleKernel(const Context& dev_ctx, int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; PADDLE_ENFORCE_EQ( query_seq_len % batches_per_block, diff --git a/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu index e4b0f90a8ce542..0e32fca9ec53b0 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu @@ -213,7 +213,7 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin, threadIdx.x / 128; // 0 or 1, two quant blocks per block const int in_y_idx = blockIdx.y; const int in_x_idx = blockIdx.x * blockDim.x + x_offset; - const int src_idx = in_y_idx * cols + in_x_idx; + const auto src_idx(in_y_idx * cols + in_x_idx); // Load data and compute swiGLU activation if (in_x_idx < cols / 2) [[likely]] { // NOLINT diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu index acb3b83bc983f3..e1afd708ce079f 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -126,16 +126,17 @@ __global__ void masked_multihead_attention_kernel( // real batch id const int bbi = bi / params.beam_width; const int hi = blockIdx.y; - const int bhi = bi * params.num_head + hi; + const auto bhi(bi * params.num_head + hi); const int kv_num_head = params.kv_num_head; const int num_head_per_group = params.num_head / kv_num_head; // hi means the head index in query processed by this cuda thread. // kv_bhi means the merged batch and head index in key and value processed by // this cuda thread. - const int kv_bhi = bi * kv_num_head + hi / num_head_per_group; + const auto kv_bhi(bi * kv_num_head + hi / num_head_per_group); + + const auto bbhi(bbi * params.beam_width * params.num_head + hi); - const int bbhi = bbi * params.beam_width * params.num_head + hi; const int tid = threadIdx.x; const int bi_seq_len_offset = bi * params.max_seq_length; @@ -153,7 +154,7 @@ __global__ void masked_multihead_attention_kernel( int start_seq = 0; int end_seq = act_time_step; bool is_last_block = (SPLIT == false); - int real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; + auto real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; if constexpr (SPLIT) { if (split_index >= real_split_each_batch) return; @@ -168,7 +169,8 @@ __global__ void masked_multihead_attention_kernel( // qkv [B, S=1, num_head + 2 * kv_num_head, head_dim] // this hi means the head index in query! 
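A minimal standalone sketch (not part of the patch) of what `auto` buys for intermediate index expressions such as `qkv_base_offset` above: the deduced type simply follows the usual arithmetic conversions of the operands, so the offset widens together with its inputs instead of being narrowed back to `int`. The values below are illustrative stand-ins for the kernel parameters (`num_head`, `Dh`, `bi`, `hi`), and the `int64_t` head dim is a hypothetical widening, not the current layout; note also that expressions built from unsigned operands (e.g. a `.size() - 1`) deduce to an unsigned type, which is worth remembering where they later meet signed comparisons.

    #include <cstdint>
    #include <type_traits>

    int main() {
      int num_head = 32;   // stand-in for params.num_head
      int bi = 1, hi = 3;  // batch / head indices
      int64_t Dh = 128;    // suppose the head dim is held as int64_t one day

      // With `auto`, the offset type is deduced from the operands (int64_t
      // here), so nothing is silently truncated back to int.
      auto qkv_base_offset = bi * num_head * Dh + hi * Dh;
      static_assert(std::is_same<decltype(qkv_base_offset), int64_t>::value,
                    "offset follows the widest operand");
      (void)qkv_base_offset;
      return 0;
    }
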
- int qkv_base_offset = bi * (params.num_head + 2 * kv_num_head) * Dh + hi * Dh; + auto qkv_base_offset = + bi * (params.num_head + 2 * kv_num_head) * Dh + hi * Dh; // QK_VEC_SIZE is only used for compute q dot k . constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); @@ -198,9 +200,9 @@ __global__ void masked_multihead_attention_kernel( // k has QK_VECS_PER_WARP elements: [Qk_vec, Qk_vec, ..., Qk_vec] // per cuda thread read a Qk_vec of q and k and compute q dot k. if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - int k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; + auto qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + auto q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; + auto k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; Qk_vec q; zero(q); @@ -246,7 +248,7 @@ __global__ void masked_multihead_attention_kernel( if (!params.neox_rotary_style) { if (params.rotary_emb_dims != 0) { - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.batch_size * Dh; Qk_vec_RoPE cos_emb, sin_emb; @@ -267,16 +269,16 @@ __global__ void masked_multihead_attention_kernel( if (params.rotary_emb_dims != 0) { int last_dim = Dh / params.rotary_emb_dims; int half_lastdim = last_dim / 2; - int rotary_offset = bi * Dh + tid * QK_VEC_SIZE; + auto rotary_offset = bi * Dh + tid * QK_VEC_SIZE; const float *cos_base = params.rotary_emb; const float *sin_base = params.rotary_emb + params.batch_size * Dh; int stride = half_lastdim / QK_VEC_SIZE; int stride_all_lastdim = 2 * stride; - int right_id = tid / stride_all_lastdim * stride_all_lastdim + - (tid + stride) % (stride_all_lastdim); - int qk_right_offset = qkv_base_offset + right_id * QK_VEC_SIZE; - int q_right_bias_offset = hi * Dh + right_id * QK_VEC_SIZE; - int k_right_bias_offset = + auto right_id = tid / stride_all_lastdim * stride_all_lastdim + + (tid + stride) % (stride_all_lastdim); + auto qk_right_offset = qkv_base_offset + right_id * QK_VEC_SIZE; + auto q_right_bias_offset = hi * Dh + right_id * QK_VEC_SIZE; + auto k_right_bias_offset = hi / num_head_per_group * Dh + right_id * QK_VEC_SIZE; Qk_vec q_right; zero(q_right); @@ -346,9 +348,9 @@ __global__ void masked_multihead_attention_kernel( if (is_last_block) { int co = tid / QK_VECS_IN_16B; int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = kv_bhi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - act_time_step * QK_ELTS_IN_16B + ci; + auto offset = kv_bhi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + act_time_step * QK_ELTS_IN_16B + ci; if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { *reinterpret_cast(¶ms.cache_kv[offset]) = k; } @@ -395,7 +397,7 @@ __global__ void masked_multihead_attention_kernel( constexpr int K_ELTS_PER_THREAD = Dh_MAX / THREADS_PER_KEY; constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - int ko = tid / THREADS_PER_KEY + start_seq; + auto ko = tid / THREADS_PER_KEY + start_seq; int ki = (tid % THREADS_PER_KEY) * K_VEC_SIZE; static_assert(Dh_MAX == THREADS_PER_KEY * K_VEC_SIZE * K_VECS_PER_THREAD, ""); @@ -412,7 +414,7 @@ __global__ void masked_multihead_attention_kernel( T *k_cache = ¶ms.cache_kv[kv_bhi * params.max_seq_length * Dh + ki]; T *k_cache_batch = ¶ms.cache_kv[bbhi * params.max_seq_length * Dh + ki]; - int ti_end = 
div_up(curr_seq_section, K_PER_WARP) * K_PER_WARP + start_seq; + auto ti_end = div_up(curr_seq_section, K_PER_WARP) * K_PER_WARP + start_seq; const int *beam_offsets = params.beam_cache_offset ? ¶ms.beam_cache_offset[bi_seq_len_offset] @@ -420,15 +422,16 @@ __global__ void masked_multihead_attention_kernel( #pragma unroll for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - const int beam_offset = beam_offsets ? beam_offsets[ti] * params.num_head * - params.max_seq_length * Dh - : 0; + const auto beam_offset(beam_offsets ? beam_offsets[ti] * params.num_head * + params.max_seq_length * Dh + : 0); + K_vec k[K_VECS_PER_THREAD]; K_vec k_vec_zero; zero(k_vec_zero); #pragma unroll for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + ti; + auto jj = ii * params.max_seq_length + ti; if (ti < end_seq) { if (beam_offset) { k[ii] = @@ -487,7 +490,7 @@ __global__ void masked_multihead_attention_kernel( qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - int useful_smem_index = + auto useful_smem_index = is_last_block ? curr_seq_section : curr_seq_section - 1; float sum = 0.f; for (int ti = tid; ti <= useful_smem_index; ti += THREADS_PER_BLOCK) { @@ -527,7 +530,7 @@ __global__ void masked_multihead_attention_kernel( // vi means the head_dim index processed by this cuda thread in the value. // so this cuda thread compute [1, k] * [k, vi:vi+V_VEC_SIZE] and k starts // from vo and increases by a step V_PER_ITER. - int vo = tid / THREADS_PER_VALUE + start_seq; + auto vo = tid / THREADS_PER_VALUE + start_seq; int vi = (tid % THREADS_PER_VALUE) * V_VEC_SIZE; T *v_cache = ¶ms.cache_kv[params.cache_batch_size * kv_num_head * @@ -550,10 +553,10 @@ __global__ void masked_multihead_attention_kernel( if (Dh == Dh_MAX || vi < Dh) { #pragma unroll for (int ti = vo; ti < end_seq; ti += V_PER_ITER) { - const int beam_offset = - beam_offsets - ? beam_offsets[ti] * params.num_head * params.max_seq_length * Dh - : 0; + const auto beam_offset(beam_offsets ? beam_offsets[ti] * params.num_head * + params.max_seq_length * Dh + : 0); + V_vec v; if (beam_offset) { v = *reinterpret_cast( @@ -662,14 +665,15 @@ __global__ void post_process_kernel(Masked_multihead_attention_params params, int act_time_step = params.sequence_lengths == nullptr ? 
params.timestep : params.sequence_lengths[bi]; - int real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; + auto real_split_each_batch = (act_time_step - 1) / params.steps_per_block + 1; if (real_split_each_batch <= 1) { return; } const int tid = threadIdx.x; const int hi = blockIdx.x; - const int bhi = (bi * params.num_head + hi); + const auto bhi((bi * params.num_head + hi)); + const int bhsi = (bi * params.num_head + hi) * params.split_seq; extern __shared__ float2 qk_sum_max_smem[]; @@ -1018,7 +1022,7 @@ void DispatchWithDtype(const Context &dev_ctx, int k_num_head = cache_kv.dims()[2]; int v_num_head = k_num_head; // this num_head means query's head - int num_head = + auto num_head = x.dims()[x.dims().size() - 1] / dim_head - k_num_head - v_num_head; Masked_multihead_attention_params params; diff --git a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu index 393128051b561a..ba8b6a169ac96f 100644 --- a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu @@ -105,9 +105,11 @@ __global__ void TransposeQkvKernel(const int H, const int NH = N * H; const int NHS = NH * S; - const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3; - const int bias_offset = m * NH + n * H; - const int out_offset = s * H + n * S * H + b * NHS + m * NHS * B; + const auto in_offset(n * H + m * NH + s * 3 * NH + b * NHS * 3); + + const auto bias_offset(m * NH + n * H); + + const auto out_offset(s * H + n * S * H + b * NHS + m * NHS * B); const int i = threadIdx.x; output[out_offset + i] = @@ -134,7 +136,7 @@ void TransQKVWithBias(const int batch, float *output, gpuStream_t stream) { // BxSx3xNxH + 3xNxH -> 3xBxNxSxH - int scratch_size = batch * head_num * seq_len * seq_len; + auto scratch_size = batch * head_num * seq_len * seq_len; const dim3 grid(seq_len, batch, 3); // scratch % 4 == 0 to ensure the alignment if (head_size % 4 == 0 && scratch_size % 4 == 0) { @@ -196,7 +198,7 @@ void TransQKVWithBias(const int batch, phi::float16 *output, gpuStream_t stream) { // BxSx3xNxH + 3xNxH -> 3xBxNxSxH - int scratch_size = batch * head_num * seq_len * seq_len; + auto scratch_size = batch * head_num * seq_len * seq_len; const dim3 grid(seq_len, batch, 3); if (head_size % 2 == 0 && scratch_size % 2 == 0) { const int h = head_size / 2; @@ -302,7 +304,7 @@ void MultiheadMatmulKernel(const Context &dev_ctx, temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); auto *temp_qk_bias = dev_ctx.template Alloc( &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); - int grid = batch * head_number * seq_len; + auto grid = batch * head_number * seq_len; int block = round_up(seq_len); broadcast<<>>( bias_qk_d, temp_qk_bias, seq_len, head_number); @@ -315,14 +317,14 @@ void MultiheadMatmulKernel(const Context &dev_ctx, temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); auto *temp_qk_bias = dev_ctx.template Alloc( &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); - int grid = batch * head_number * seq_len; + auto grid = batch * head_number * seq_len; int block = round_up(seq_len); broadcast_batch_head_number<<>>( bias_qk_d, temp_qk_bias, batch, seq_len, head_number); bias_qk_d = static_cast(temp_qk_bias); } if (!bias_qk) { - int size = batch * head_number * seq_len * seq_len; + auto size = batch * head_number * seq_len * seq_len; temp_bias_tensor.Resize({size}); auto *temp_qk_bias = dev_ctx.template Alloc( &temp_bias_tensor, 
temp_bias_tensor.numel() * sizeof(T)); @@ -362,7 +364,7 @@ void MultiheadMatmulKernel(const Context &dev_ctx, phi::DenseTensor multihead_temp_tensor; // B * head_number * S * S * 1 + B * S * 3 * N * H - int scratch_size = batch * head_number * seq_len * seq_len * 1; + auto scratch_size = batch * head_number * seq_len * seq_len * 1; multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()}); auto *multihead_temp_data = dev_ctx.template Alloc( &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T)); @@ -408,7 +410,7 @@ void MultiheadMatmulKernel(const Context &dev_ctx, T(0.0)); } - int grid = batch * head_number * seq_len; + auto grid = batch * head_number * seq_len; int block = head_size; transpose<<>>( tptr, output_d, batch, seq_len, head_number, head_size); diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu index b2d15a59f8b1c9..1700cadaf95ecc 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -92,14 +92,14 @@ __global__ void qkv_attention_kernel(QkvUnpackMhaParams params, // real batch id const int bbi = bi / params.beam_width; const int hi = blockIdx.x; - const int bhi = bi * params.num_head + hi; + const auto bhi(bi * params.num_head + hi); const int kv_num_head = params.kv_num_head; const int num_head_per_group = params.num_head / kv_num_head; - const int kv_bhi = bi * kv_num_head + hi / num_head_per_group; + const auto kv_bhi(bi * kv_num_head + hi / num_head_per_group); - const int bbhi = bbi * params.beam_width * params.num_head + hi; + const auto bbhi(bbi * params.beam_width * params.num_head + hi); const int tid = threadIdx.x; @@ -108,7 +108,7 @@ __global__ void qkv_attention_kernel(QkvUnpackMhaParams params, int act_time_step = params.timestep; - int qkv_base_offset = bi * (params.num_head) * Dh + hi * Dh; + auto qkv_base_offset = bi * (params.num_head) * Dh + hi * Dh; constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); @@ -120,9 +120,9 @@ __global__ void qkv_attention_kernel(QkvUnpackMhaParams params, // load q element to q smem if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - int q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - int k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; + auto qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + auto q_bias_offset = hi * Dh + tid * QK_VEC_SIZE; + auto k_bias_offset = hi / num_head_per_group * Dh + tid * QK_VEC_SIZE; Qk_vec q; zero(q); diff --git a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc index d9dee204f7fc38..ca50fe63fc33aa 100644 --- a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc @@ -73,11 +73,11 @@ class GRUOneDNNHandler // Is it int8 kernel const bool is_INT8 = std::is_same::value; if (is_INT8) { - const int weights_scale_mask = + const auto weights_scale_mask( 0 + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` - + - (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + (1 << 4)); + // bit, indicating the unique scales for `o` dim in `ldigo` attr_.set_rnn_data_qparams(scale_data, shift_data); attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); diff --git a/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h b/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h index 
95e8900cc439c6..71ec1e859ee46a 100644 --- a/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h +++ b/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h @@ -65,11 +65,11 @@ class RNNONEDNNHandler : public phi::funcs::OneDNNHandlerT { if (is_INT8) { // Int8 attributes - const int weights_scale_mask = + const auto weights_scale_mask( 0 + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` - + - (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + (1 << 4)); + // bit, indicating the unique scales for `o` dim in `ldigo` attr_.set_rnn_data_qparams(scale_data, shift_data); attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); diff --git a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc old mode 100755 new mode 100644 index 1a21b7a1b3e562..3bd0a9ebc95a78 --- a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc @@ -69,7 +69,7 @@ void qkv_split_rope_kernel( auto k_data = reinterpret_cast(k_out->data()); auto v_data = reinterpret_cast(v_out->data()); auto qkv_input_data = reinterpret_cast(qkv_input.data()); - int qkv_head = q_num_head + 2 * kv_num_head; + auto qkv_head = q_num_head + 2 * kv_num_head; int32_t ret; ret = baidu::xpu::api::split(xpu_ctx.x_context(), qkv_input_data, @@ -195,7 +195,8 @@ void BlockMultiheadAttentionXPUKernel( const int kv_num_head = key_cache_dims[1]; const int dim_head = key_cache_dims[3]; const int total_num_head = qkv.dims()[qkv.dims().size() - 1] / dim_head; - const int q_num_head = total_num_head - 2 * kv_num_head; + const auto q_num_head(total_num_head - 2 * kv_num_head); + const int bsz = cum_offsets.dims()[0]; const int max_block_per_seq = block_tables.dims()[1]; const int out_row = fmha_out->dims()[0]; diff --git a/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc index cfbdffb3473f31..650153663d06ac 100644 --- a/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/embedding_with_eltwise_add_xpu_kernel.cc @@ -28,7 +28,7 @@ void FillSeqLod(int batch_size, for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { int cur_batch_seq_len = 0; for (int seq_idx = 0; seq_idx < max_seq_len; seq_idx++) { - int mask_idx = batch_idx * max_seq_len + seq_idx; + auto mask_idx = batch_idx * max_seq_len + seq_idx; if (mask[mask_idx] > 0) { cur_batch_seq_len++; } else { @@ -47,7 +47,7 @@ void FillSeqLod(int batch_size, for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { int cur_batch_seq_len = 0; for (int seq_idx = 0; seq_idx < max_seq_len; seq_idx++) { - int mask_idx = batch_idx * max_seq_len + seq_idx; + auto mask_idx = batch_idx * max_seq_len + seq_idx; if (mask[mask_idx] > 1e-7) { cur_batch_seq_len++; } else { diff --git a/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu index 6fdcebde8e6d94..b83b0398380af0 100644 --- a/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu @@ -65,9 +65,10 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel(const T* dy, T ds_sum = 0; T db_sum = 0; for (int64_t j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const auto index(layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i); + ds_sum += dy[index] * x[index]; db_sum += dy[index]; } diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 4dbbcb814cee21..74a29561a543b6 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -74,8 +74,8 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx, // reduce_dims = [3] // reduce along the broadcasted axis std::vector reduce_dims_vec; for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; + auto out_axis = out_rank - j - 1; + auto in_axis = in_rank - j - 1; if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { reduce_dims_vec.push_back(in_axis); diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu index cc8fdbb57ff5a7..9a9a0be0917338 100644 --- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu @@ -416,7 +416,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, size_t cub_temp_storage_bytes = std::max(std::max(cub_sort_temp_store_size, cub_scan_temp_store_size), cub_sum_temp_store_size); - int num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; + auto num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; PADDLE_ENFORCE_GT( (4 * num_buffer_ele + 3 * (nranks + 1) + num_temp_ele), 0, diff --git a/paddle/phi/kernels/gpu/correlation_grad_kernel.cu b/paddle/phi/kernels/gpu/correlation_grad_kernel.cu index 2a1f277d8e77f9..c4eea6654cccb7 100644 --- a/paddle/phi/kernels/gpu/correlation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/correlation_grad_kernel.cu @@ -47,7 +47,7 @@ __global__ void correlation_backward_input1(int64_t n, int kernel_rad = (kernel_size - 1) / 2; int displacement_rad = max_displacement / stride2; - int displacement_size = 2 * displacement_rad + 1; + auto displacement_size = 2 * displacement_rad + 1; int64_t xmin = (w - kernel_rad - max_displacement) / stride1; int64_t ymin = (h - kernel_rad - max_displacement) / stride1; @@ -128,7 +128,7 @@ __global__ void correlation_backward_input2(int64_t n, int kernel_rad = (kernel_size - 1) / 2; int displacement_rad = max_displacement / stride2; - int displacement_size = 2 * displacement_rad + 1; + auto displacement_size = 2 * displacement_rad + 1; int64_t p_input_width = input_width + 2 * pad_size; int64_t p_input_height = input_height + 2 * pad_size; @@ -208,8 +208,8 @@ void CorrelationCUDAGradKernel(const Context &dev_ctx, int H = in_dims[2]; int W = in_dims[3]; - int padded_input_height = H + 2 * pad_size; - int padded_input_width = W + 2 * pad_size; + auto padded_input_height = H + 2 * pad_size; + auto padded_input_width = W + 2 * pad_size; phi::DenseTensor rinput1; rinput1.Resize({N, padded_input_height, padded_input_width, C}); diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu index aa0aeb62683164..93500e60d8f72d 100644 --- a/paddle/phi/kernels/gpu/correlation_kernel.cu +++ b/paddle/phi/kernels/gpu/correlation_kernel.cu @@ -41,7 +41,7 @@ __global__ void correlation_forward(T *output, int kernel_rad = (kernel_size - 1) / 2; int displacement_rad = max_displacement / stride2; - int displacement_size = 2 * displacement_rad + 1; + auto 
displacement_size = 2 * displacement_rad + 1; int64_t global_block_id = blockIdx.x; int64_t hw = (int64_t)OH * OW; @@ -130,8 +130,8 @@ void CorrelationCUDAKernel(const Context &dev_ctx, int H = in_dims[2]; int W = in_dims[3]; - int padded_input_height = H + 2 * pad_size; - int padded_input_width = W + 2 * pad_size; + auto padded_input_height = H + 2 * pad_size; + auto padded_input_width = W + 2 * pad_size; phi::DenseTensor rinput1; rinput1.Resize({N, padded_input_height, padded_input_width, C}); diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index af56951ebcf48a..a96e1afd4f4dae 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -47,7 +47,7 @@ __global__ void SoftLabelCrossEntropyGradientKernel(T* logit_grad, if (ids < static_cast(n) * d) { int idx_n = ids / d; int idx_remain = ids % remain; - int idx_loss = idx_n * remain + idx_remain; + auto idx_loss = idx_n * remain + idx_remain; logit_grad[ids] = loss_grad[idx_loss] * (-labels[ids] / logit_grad[ids]); } } @@ -63,7 +63,7 @@ __global__ void HardLabelCrossEntropyGradientKernel(T* logit_grad, int idx_n = index / remain; int idx_remain = index % remain; int tmp = static_cast(labels[index]); - int idx = idx_n * d + tmp * remain + idx_remain; + auto idx = idx_n * d + tmp * remain + idx_remain; if (ignore_index != tmp) { logit_grad[idx] = -static_cast(1.) / logit_grad[idx]; } @@ -81,7 +81,7 @@ __global__ void ScaleCrossEntropyGradient(T* logit_grad, CUDA_KERNEL_LOOP(index, num) { int idx_n = index / d; int idx_remain = index % remain; - int idx_lbl = idx_n * remain + idx_remain; + auto idx_lbl = idx_n * remain + idx_remain; int k = (index % d) / remain; auto lbl = static_cast(labels[idx_lbl]); if (lbl == ignore_index || lbl != k) { diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index be2c296a2ff046..7b43d631f57af5 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -755,7 +755,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, // use 128 threads per block to maximimize gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); + auto warps_per_block = (threads_per_block / kWarpSize); int batches_per_block = warps_per_block * batches_per_warp; int64_t blocks = (static_cast(N) + batches_per_block - 1) / batches_per_block; @@ -1099,7 +1099,7 @@ void SwitchWarpSoftmaxForward(T* loss, int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; int batches_per_warp = (kDimCeil <= 128) ? 
2 : 1; constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); + auto warps_per_block = (threads_per_block / kWarpSize); int batches_per_block = warps_per_block * batches_per_warp; int64_t blocks = (static_cast(batch_size) + batches_per_block - 1) / batches_per_block; diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 2edac5eba5d9ef..97c55e3fc436e5 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -228,12 +228,15 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( return; int tmp_1 = idx / output_width; - const int w_out = idx - tmp_1 * output_width; + const auto w_out(idx - tmp_1 * output_width); + int tmp_2 = tmp_1 / output_height; - const int h_out = tmp_1 - tmp_2 * output_height; + const auto h_out(tmp_1 - tmp_2 * output_height); + tmp_1 = tmp_2; tmp_2 = tmp_1 / output_channels; - const int c_out = tmp_1 - tmp_2 * output_channels; + const auto c_out(tmp_1 - tmp_2 * output_channels); + const int batch = tmp_2; const int c_in = c_out / filter_multiplier; @@ -241,9 +244,9 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( int in_offset = ((batch * input_channels + c_in) * input_height) * input_width; - int weight_offset = c_out * filter_height * filter_width; - int h_in_start = -padding_height + h_out * stride_height; - int w_in_start = -padding_width + w_out * stride_width; + auto weight_offset = c_out * filter_height * filter_width; + auto h_in_start = -padding_height + h_out * stride_height; + auto w_in_start = -padding_width + w_out * stride_width; #pragma unroll for (int fh = 0, h_in = h_in_start; fh < fh_size; @@ -252,7 +255,7 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( for (int fw = 0, w_in = w_in_start; fw < fw_size; fw++, w_in += dilate_width) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = in_offset + h_in * input_width + w_in; + auto offset = in_offset + h_in * input_width + w_in; T in_data = input_data[offset]; if (fuse_relu_before_conv) { value += filter_data[weight_offset] * @@ -280,20 +283,26 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( } int tmp_1 = idx / output_channels; - const int c_out = idx - tmp_1 * output_channels; + const auto c_out(idx - tmp_1 * output_channels); + int tmp_2 = tmp_1 / output_width; - const int w_out = tmp_1 - tmp_2 * output_width; + const auto w_out(tmp_1 - tmp_2 * output_width); + tmp_1 = tmp_2; tmp_2 = tmp_1 / output_height; - const int h_out = tmp_1 - tmp_2 * output_height; + const auto h_out(tmp_1 - tmp_2 * output_height); + const int batch = tmp_2; const int c_in = c_out / filter_multiplier; T value(0); - const int in_offset = - batch * input_height * input_width * input_channels + c_in; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; + const auto in_offset(batch * input_height * input_width * input_channels + + c_in); + + const auto h_in_start(-padding_height + h_out * stride_height); + + const auto w_in_start(-padding_width + w_out * stride_width); + int weight_offset = 0; #pragma unroll @@ -303,7 +312,7 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( for (int fw = 0, w_in = w_in_start; fw < fw_size; ++fw, w_in += dilate_width) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = in_offset + (h_in * input_width + w_in) * input_channels; + auto offset = in_offset + (h_in * input_width + w_in) * 
input_channels; T in_data = input_data[offset]; const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { @@ -336,8 +345,9 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( const int c_in = c_out / filter_multiplier; T value(0); - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; + const auto h_in_start(-padding_height + h_out * stride_height); + + const auto w_in_start(-padding_width + w_out * stride_width); int in_offset = ((batch * input_channels + c_in) * input_height) * input_width; @@ -348,7 +358,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( w_in += dilate_width, w_f++) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = in_offset + h_in * input_width + w_in; + auto offset = in_offset + h_in * input_width + w_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * static_cast( @@ -359,7 +369,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( } } } - int index = + auto index = ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + w_out; output_data[index] = value; @@ -375,10 +385,11 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( if (h_out >= output_height) { return; } - int in_offset = batch * input_height * input_width * input_channels; - int out_offset = + auto in_offset = batch * input_height * input_width * input_channels; + auto out_offset = (batch * output_height + h_out) * output_width * output_channels; - const int h_in_start = -padding_height + h_out * stride_height; + const auto h_in_start(-padding_height + h_out * stride_height); + const int wi_size = (output_width + dilate_width - 1) / dilate_width; const int kWeightSize = c_filter * c_filter; T r_weight[kWeightSize]; @@ -391,20 +402,21 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( const int c_in = c_out / filter_multiplier; for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { int i_dw = i / wi_size; - int i_wi = i - i_dw * wi_size; - int w_out = i_wi * dilate_width + i_dw; + auto i_wi = i - i_dw * wi_size; + auto w_out = i_wi * dilate_width + i_dw; if (w_out >= output_width) { continue; } T value(0); - const int w_in_start = -padding_width + w_out * stride_width; + const auto w_in_start(-padding_width + w_out * stride_width); + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; h_in += dilate_height, h_f++) { for (int w_in = w_in_start, w_f = 0; w_f < c_filter; w_in += dilate_width, w_f++) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset = + auto offset = in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * @@ -416,7 +428,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( } } } - int index = out_offset + w_out * output_channels + c_out; + auto index = out_offset + w_out * output_channels + c_out; output_data[index] = value; } } @@ -558,25 +570,28 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNCHW( } int tmp_1 = idx / input_width; - const int w_in = idx - tmp_1 * input_width; + const auto w_in(idx - tmp_1 * input_width); + int tmp_2 = tmp_1 / input_height; - const int h_in = tmp_1 - tmp_2 * input_height; + const auto h_in(tmp_1 - tmp_2 * input_height); + tmp_1 = tmp_2; tmp_2 = tmp_1 / input_channels; - const int c_in = tmp_1 - tmp_2 * input_channels; + const auto c_in(tmp_1 - tmp_2 * 
input_channels); + const int batch = tmp_2; T value(0); for (int c_mul = 0; c_mul < filter_multiplier; ++c_mul) { - int c_out = c_in * filter_multiplier + c_mul; - int filter_offset = c_out * filter_height * filter_width; + auto c_out = c_in * filter_multiplier + c_mul; + auto filter_offset = c_out * filter_height * filter_width; #pragma unroll for (int fh = 0; fh < fh_size; ++fh) { #pragma unroll for (int fw = 0; fw < fw_size; ++fw) { - int h_out = h_in + padding_height - fh * dilate_height; - int w_out = w_in + padding_width - fw * dilate_width; + auto h_out = h_in + padding_height - fh * dilate_height; + auto w_out = w_in + padding_width - fw * dilate_width; if ((h_out - h_out / stride_height * stride_height == 0) && (w_out - w_out / stride_width * stride_width == 0)) { h_out /= stride_height; @@ -584,7 +599,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNCHW( if (h_out >= 0 && h_out < output_height && w_out >= 0 && w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_channels + c_out) * output_height + h_out) * output_width + w_out; @@ -610,15 +625,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { for (int w_in = threadIdx.y; w_in < input_width; w_in += blockDim.y) { - int h_out_start = + auto h_out_start = h_in - (filter_height - 1) * dilate_height + padding_height; - int w_out_start = + auto w_out_start = w_in - (filter_width - 1) * dilate_width + padding_width; T value(0); - int index = ((batch * input_height + h_in) * input_width + w_in) * - input_channels + - c_in; + auto index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; if (fuse_relu_before_conv) { if (input_data[index] <= T(0)) { input_grad_data[index] = 0; @@ -627,7 +642,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( } for (int c_i = 0; c_i < filter_multiplier; c_i++) { - int c_out = c_in * filter_multiplier + c_i; + auto c_out = c_in * filter_multiplier + c_i; int weight_offset = filter_height * filter_width; for (int h_out = h_out_start, h_f = 0; h_f < filter_height; h_out += dilate_height, h_f++) { @@ -639,11 +654,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_height + s_h_out) * output_width + s_w_out) * output_channels + c_out; - int filter_offset = weight_offset * output_channels + c_out; + auto filter_offset = weight_offset * output_channels + c_out; value += output_grad_data[output_grad_offset] * filter_data[filter_offset]; } @@ -661,13 +676,14 @@ template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + const auto kWeightSize(c_filter * c_filter * c_filter_multiplier + 1); + T r_weight[kWeightSize]; const int batch = blockIdx.y; const int c_in = blockIdx.x; for (int c_i = 0; c_i < filter_multiplier; c_i++) { - int c_out = c_in * filter_multiplier + c_i; + auto c_out = c_in * filter_multiplier + c_i; const T* weight = filter_data + c_out * c_filter * c_filter; for (int i = 0; i < c_filter * c_filter; i++) r_weight[i + c_i * c_filter * c_filter] = @@ -676,11 +692,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( for (int w_in 
= threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; - int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + auto h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; + auto w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value(0); - int index = + auto index = ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + w_in; if (fuse_relu_before_conv) { @@ -691,7 +707,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( } for (int c_i = 0; c_i < filter_multiplier; c_i++) { - int c_out = c_in * filter_multiplier + c_i; + auto c_out = c_in * filter_multiplier + c_i; for (int h_out = h_out_start, h_f = 0; h_f < c_filter; h_out += dilate_height, h_f++) { for (int w_out = w_out_start, w_f = 0; w_f < c_filter; @@ -701,7 +717,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_channels + c_out) * output_height + s_h_out) * output_width + @@ -728,16 +744,17 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( if (h_in >= input_height) { return; } - const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + const auto kWeightSize(c_filter * c_filter * c_filter_multiplier + 1); + T r_weight[kWeightSize]; const int batch = blockIdx.z; const int wi_size = (input_width + dilate_width - 1) / dilate_width; - const int h_out_start = - h_in - (c_filter - 1) * dilate_height + padding_height; + const auto h_out_start(h_in - (c_filter - 1) * dilate_height + + padding_height); for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { - int c_out = c_in * c_filter_multiplier + c_i; + auto c_out = c_in * c_filter_multiplier + c_i; for (int i = 0; i < c_filter * c_filter; i++) r_weight[i + c_i * c_filter * c_filter] = filter_data[(c_filter * c_filter - i - 1) * output_channels + @@ -745,17 +762,17 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { int i_dw = i / wi_size; - int i_wi = i - i_dw * wi_size; - int w_in = i_wi * dilate_width + i_dw; + auto i_wi = i - i_dw * wi_size; + auto w_in = i_wi * dilate_width + i_dw; if (w_in >= input_width) { continue; } - int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + auto w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value(0); - int index = ((batch * input_height + h_in) * input_width + w_in) * - input_channels + - c_in; + auto index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; if (fuse_relu_before_conv) { if (input_data[index] <= T(0)) { input_grad_data[index] = 0; @@ -764,7 +781,7 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { - int c_out = c_in * c_filter_multiplier + c_i; + auto c_out = c_in * c_filter_multiplier + c_i; for (int h_out = h_out_start, h_f = 0; h_f < c_filter; h_out += dilate_height, h_f++) { for (int w_out = w_out_start, w_f = 0; w_f < c_filter; @@ -774,7 +791,7 @@ __device__ __inline__ void 
KernelDepthwiseConvInputGradCFilterNHWC( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset = + auto output_grad_offset = ((batch * output_height + s_h_out) * output_width + s_w_out) * output_channels + c_out; @@ -941,22 +958,23 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( const int ohw = output_height * output_width; const int onhw = num * ohw; - const int h_offset = kh_id * dilate_height - padding_height; - const int w_offset = kw_id * dilate_width - padding_width; + const auto h_offset(kh_id * dilate_height - padding_height); + + const auto w_offset(kw_id * dilate_width - padding_width); if (loop_batch) { for (int og_w = threadIdx.x; og_w < output_width; og_w += blockDim.x) { for (int bid = 0; bid < num; ++bid) { for (int og_h = threadIdx.y; og_h < output_height; og_h += blockDim.y) { - int i_h = og_h * stride_height + h_offset; - int i_w = og_w * stride_width + w_offset; + auto i_h = og_h * stride_height + h_offset; + auto i_w = og_w * stride_width + w_offset; if (i_w >= 0 && i_w < input_width && i_h >= 0 && i_h < input_height) { - int input_offset = + auto input_offset = ((bid * input_channels + ic_id) * input_height + i_h) * input_width + i_w; - int output_grad_offset = + auto output_grad_offset = ((bid * output_channels + oc_id) * output_height + og_h) * output_width + og_w; @@ -976,19 +994,19 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( } else { for (int id = threadIdx.x; id < onhw; id += blockDim.x) { int bid = id / ohw; - int og_hw = id - bid * ohw; + auto og_hw = id - bid * ohw; int og_h = og_hw / output_width; - int og_w = og_hw - og_h * output_width; + auto og_w = og_hw - og_h * output_width; - int i_h = og_h * stride_height + h_offset; - int i_w = og_w * stride_width + w_offset; + auto i_h = og_h * stride_height + h_offset; + auto i_w = og_w * stride_width + w_offset; if (i_w >= 0 && i_w < input_width && i_h >= 0 && i_h < input_height) { - int input_offset = + auto input_offset = ((bid * input_channels + ic_id) * input_height + i_h) * input_width + i_w; - int output_grad_offset = (bid * output_channels + oc_id) * ohw + og_hw; + auto output_grad_offset = (bid * output_channels + oc_id) * ohw + og_hw; if (fuse_relu_before_conv) { f_grad += output_grad_data[output_grad_offset] * static_cast(max( @@ -1114,22 +1132,22 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( for (int kernel_id = threadIdx.x; kernel_id < output_channels; kernel_id += blockDim.x) { T s(0); - int gbid = + auto gbid = ((kernel_id * filter_height) + kernel_ih) * filter_width + kernel_iw; for (int image_w = threadIdx.y; image_w < output_width; image_w += blockDim.y) { - int kernel_h = kernel_ih * dilate_height - padding_height; - int kernel_w = kernel_iw * dilate_width - padding_width; + auto kernel_h = kernel_ih * dilate_height - padding_height; + auto kernel_w = kernel_iw * dilate_width - padding_width; - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; + auto image_hk = image_h * stride_height + kernel_h; + auto image_wk = image_w * stride_width + kernel_w; if (image_hk < 0 || image_hk >= input_height) continue; if (image_wk < 0 || image_wk >= input_width) continue; - int input_id = + auto input_id = ((bid * input_height + image_hk) * input_width + image_wk) * input_channels + kernel_id / filter_multiplier; - int output_id = + auto output_id = ((bid * output_height + image_h) * 
output_width + image_w) * output_channels + kernel_id; @@ -1141,7 +1159,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( s += output_grad_data[output_id] * input_data[input_id]; } } - const int numel = output_channels * filter_width * filter_height; + const auto numel(output_channels * filter_width * filter_height); + NoReturnAtomicAdd(filter_grad_data, gbid, numel, s); } } @@ -1183,24 +1202,24 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { int i_dw = i / wi_size; - int i_wi = i - i_dw * wi_size; - int image_w = i_wi * dilate_width + i_dw; + auto i_wi = i - i_dw * wi_size; + auto image_w = i_wi * dilate_width + i_dw; if (image_w >= output_width) { continue; } for (int kernel_ih = 0; kernel_ih < c_filter; ++kernel_ih) { for (int kernel_iw = 0; kernel_iw < c_filter; ++kernel_iw) { - int kernel_h = kernel_ih * dilate_height - padding_height; - int kernel_w = kernel_iw * dilate_width - padding_width; - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; + auto kernel_h = kernel_ih * dilate_height - padding_height; + auto kernel_w = kernel_iw * dilate_width - padding_width; + auto image_hk = image_h * stride_height + kernel_h; + auto image_wk = image_w * stride_width + kernel_w; if (image_hk < 0 || image_hk >= input_height) continue; if (image_wk < 0 || image_wk >= input_width) continue; - int input_id = + auto input_id = ((bid * input_height + image_hk) * input_width + image_wk) * input_channels + kernel_id / filter_multiplier; - int output_id = + auto output_id = ((bid * output_height + image_h) * output_width + image_w) * output_channels + kernel_id; @@ -1249,32 +1268,35 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterSmallChannelNHWC( T* filter_grad_data) { const int bid = blockIdx.y; const int idx = blockIdx.x * blockDim.x + threadIdx.x; - const int numel = output_channels * c_filter * c_filter; + const auto numel(output_channels * c_filter * c_filter); + if (idx >= numel) { return; } const int tmp = idx / output_channels; - const int kernel_id = idx - tmp * output_channels; + const auto kernel_id(idx - tmp * output_channels); + const int kernel_ih = tmp / c_filter; - const int kernel_iw = tmp - kernel_ih * c_filter; + const auto kernel_iw(tmp - kernel_ih * c_filter); - const int h_offset = kernel_ih * dilate_height - padding_height; - const int w_offset = kernel_iw * dilate_width - padding_width; + const auto h_offset(kernel_ih * dilate_height - padding_height); + + const auto w_offset(kernel_iw * dilate_width - padding_width); T s(0); for (int og_h = 0; og_h < output_height; ++og_h) { for (int og_w = 0; og_w < output_width; ++og_w) { - int image_hk = og_h * stride_height + h_offset; - int image_wk = og_w * stride_width + w_offset; + auto image_hk = og_h * stride_height + h_offset; + auto image_wk = og_w * stride_width + w_offset; if (image_hk >= 0 && image_hk < input_height && image_wk >= 0 && image_wk < input_width) { - int input_id = + auto input_id = ((bid * input_height + image_hk) * input_width + image_wk) * input_channels + kernel_id / filter_multiplier; - int output_id = ((bid * output_height + og_h) * output_width + og_w) * - output_channels + - kernel_id; + auto output_id = ((bid * output_height + og_h) * output_width + og_w) * + output_channels + + kernel_id; if (fuse_relu_before_conv) { s += output_grad_data[output_id] * static_cast( @@ -1852,7 +1874,8 @@ class DepthwiseConvFilterGradFunctor 
multi_fpn_rois, std::vector multi_level_rois_num, DenseTensor* restore_index) { - int num_level = max_level - min_level + 1; + auto num_level = max_level - min_level + 1; // check that the fpn_rois is not empty if (!rois_num.get_ptr()) { diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index 2e2f3dd127e9e4..e75f7203127c30 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -53,7 +53,7 @@ __global__ void Levenshtein(T* dist, const int start) { int idx = blockDim.x * blockIdx.x + threadIdx.x; int offset = N; - int index = start + idx * offset; + auto index = start + idx * offset; int row = index / (N + 1); int col = index % (N + 1); if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { @@ -171,12 +171,12 @@ void EditDistanceKernel(const Context& dev_ctx, // Compute the elements of distance matrix in the anti-diagonal direction for (int64_t slice = 2; slice < m + n + 1; ++slice) { - int z_m = slice < m + 1 ? 0 : slice - m; - int z_n = slice < n + 1 ? 0 : slice - n; - int size = slice - (z_m + z_n) + 1; // number of elements in the same - // anti-diagonal line to update + auto z_m = slice < m + 1 ? 0 : slice - m; + auto z_n = slice < n + 1 ? 0 : slice - n; + auto size = slice - (z_m + z_n) + 1; // number of elements in the same + // anti-diagonal line to update // the start index at which computes from - int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + auto start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index f3a2874b9234f5..d486c1359dd8bf 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -115,7 +115,7 @@ void GetGradXOrYOut(const GPUContext &dev_ctx, template static __global__ void SimpleElemwiseAddGradCUDAKernel( const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { - int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + auto tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; int stride = GRID_NUM_X * BLOCK_NUM_X; int loop = size / vec_size; int remainder = size % vec_size; diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu index 7f3a10a13efea9..d248ac8e533c09 100644 --- a/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu @@ -212,9 +212,10 @@ void FlashAttnV3BaseKernel( } auto const sizes = q.dims(); - const int batch_size = !is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1; + const auto batch_size(!is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1); + int seqlen_q = !is_varlen_q ? sizes[1] : max_seqlen_q_; - int total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; + auto total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; int num_heads = q.dims()[q.dims().size() - 2]; int const head_size = q.dims()[q.dims().size() - 1]; int const head_size_v = v.dims()[v.dims().size() - 1]; @@ -522,7 +523,7 @@ void FlashAttnV3BaseKernel( // We don't need max_seqlen_k_new, so seqlen_k_new can be whatever when // is_varlen_k_new int seqlen_k_new = !is_varlen_k_new ? k_new.dims()[1] : 0; - int total_k_new = + auto total_k_new = !is_varlen_k_new ? 
batch_size * k_new.dims()[1] : k_new.dims()[0]; if (!is_varlen_k_new) { CHECK_SHAPE(k_new, batch_size, seqlen_k_new, num_heads_k, head_size); @@ -601,8 +602,8 @@ void FlashAttnV3BaseKernel( : ((params_is_causal && !is_varlen) || (is_varlen && params_num_splits > 1)); if (scheduler_needs_semaphore || use_dynamic_split) { - int metadata_size = static_cast(scheduler_needs_semaphore) + - static_cast(use_dynamic_split) * params_b; + auto metadata_size = static_cast(scheduler_needs_semaphore) + + static_cast(use_dynamic_split) * params_b; phi::dynload::fa3_fwd_params_set_skip_scheduler_metadata_computation( params_handle, scheduler_metadata_.is_initialized()); if (scheduler_metadata_.is_initialized()) { @@ -1372,9 +1373,10 @@ void FlashMaskV2BaseKernel( } auto const sizes = q.dims(); - const int batch_size = !is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1; + const auto batch_size(!is_varlen_q ? sizes[0] : cu_seqlens_q.dims()[0] - 1); + int seqlen_q = !is_varlen_q ? sizes[1] : max_seqlen_q_; - int total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; + auto total_q = !is_varlen_q ? batch_size * sizes[1] : sizes[0]; int num_heads = q.dims()[q.dims().size() - 2]; int const head_size = q.dims()[q.dims().size() - 1]; int const head_size_v = v.dims()[v.dims().size() - 1]; @@ -1684,7 +1686,7 @@ void FlashMaskV2BaseKernel( // We don't need max_seqlen_k_new, so seqlen_k_new can be whatever when // is_varlen_k_new int seqlen_k_new = !is_varlen_k_new ? k_new.dims()[1] : 0; - int total_k_new = + auto total_k_new = !is_varlen_k_new ? batch_size * k_new.dims()[1] : k_new.dims()[0]; if (!is_varlen_k_new) { CHECK_SHAPE(k_new, batch_size, seqlen_k_new, num_heads_k, head_size); @@ -1768,8 +1770,8 @@ void FlashMaskV2BaseKernel( : ((params_is_causal && !is_varlen) || (is_varlen && params_num_splits > 1)); if (scheduler_needs_semaphore || use_dynamic_split) { - int metadata_size = static_cast(scheduler_needs_semaphore) + - static_cast(use_dynamic_split) * params_b; + auto metadata_size = static_cast(scheduler_needs_semaphore) + + static_cast(use_dynamic_split) * params_b; phi::dynload:: flashmaskv2_fwd_params_set_skip_scheduler_metadata_computation( params_handle, scheduler_metadata_.is_initialized()); diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu b/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu index 346e329f7d9d4d..3c5cd3f4ecde72 100644 --- a/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu +++ b/paddle/phi/kernels/gpu/flash_attn_v3_utils.cu @@ -220,7 +220,7 @@ void set_params_fprop(Flash_fwd_params *params_handle, dynload::fa3_fwd_params_set_window_size_right(params_handle, window_size_right); - int arch = dprops.major * 10 + dprops.minor; + auto arch = dprops.major * 10 + dprops.minor; int num_sm = dprops.multiProcessorCount - sm_margin; dynload::fa3_fwd_params_set_arch(params_handle, arch); @@ -498,7 +498,7 @@ void set_flashmaskv2_params_fprop(Flash_fwd_params *params_handle, dynload::flashmaskv2_fwd_params_set_window_size_right(params_handle, window_size_right); - int arch = dprops.major * 10 + dprops.minor; + auto arch = dprops.major * 10 + dprops.minor; int num_sm = dprops.multiProcessorCount - sm_margin; dynload::flashmaskv2_fwd_params_set_arch(params_handle, arch); diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index d96cde7884de70..abe41249d3c3b6 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -210,7 +210,7 @@ static __global__ void 
FilterBBoxes(const T *bboxes, } __syncthreads(); if (threadIdx.x == 0) { - int size = (num - i) < BlockSize ? num - i : BlockSize; + auto size = (num - i) < BlockSize ? num - i : BlockSize; for (int j = 0; j < size; ++j) { if (keep_index[j] > -1) { keep[cnt++] = keep_index[j]; diff --git a/paddle/phi/kernels/gpu/global_gather_kernel.cu b/paddle/phi/kernels/gpu/global_gather_kernel.cu index c2efdc5af22204..1825502b3b6bc7 100644 --- a/paddle/phi/kernels/gpu/global_gather_kernel.cu +++ b/paddle/phi/kernels/gpu/global_gather_kernel.cu @@ -114,7 +114,7 @@ struct GlobalGatherFunctor { for (auto i = 0; i < n_expert; ++i) { comm_ctx->GroupStart(); for (auto j = 0; j < nranks; ++j) { - int idx = i + j * n_expert; + auto idx = i + j * n_expert; if (cpu_global_count_data[idx]) { auto send_buf = distributed::GetPartialTensor( *x, send_ptr * in_feat, cpu_global_count_data[idx] * in_feat); diff --git a/paddle/phi/kernels/gpu/global_scatter_kernel.cu b/paddle/phi/kernels/gpu/global_scatter_kernel.cu index 752b2aacf7e882..ff3e1817d9d4e8 100644 --- a/paddle/phi/kernels/gpu/global_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/global_scatter_kernel.cu @@ -114,7 +114,7 @@ struct GlobalScatterFunctor { for (auto i = 0; i < n_expert; ++i) { comm_ctx->GroupStart(); for (auto j = 0; j < nranks; ++j) { - int idx = i + j * n_expert; + auto idx = i + j * n_expert; if (cpu_local_count_data[idx]) { auto send_buf = distributed::GetPartialTensor( *x, diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index 18eefe68f2033e..d47d9ea71f3aed 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -388,7 +388,7 @@ void GroupNormGradKernel(const Context& dev_ctx, groups, std::min(max_grid_z, x_dims[0])); dim3 threads(block_size, 1, 1); - int flags = + auto flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; if (data_layout == DataLayout::kNCHW) { const int max_num_threads = 1024; @@ -486,7 +486,7 @@ void GroupNormGradKernel(const Context& dev_ctx, set_zero_AccT(dev_ctx, &temp_mean, static_cast(0)); auto* temp_mean_data = temp_mean.data(); - int flags = + auto flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index dcedf1873286a3..9fb15881846782 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -825,7 +825,7 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, params_.eps = epsilon; auto stream = dev_ctx.stream(); DenseTensor redBuffer; - int buffer_sizes = 2 * params_.n * groups; + auto buffer_sizes = 2 * params_.n * groups; redBuffer.Resize({1, buffer_sizes}); params_.redBuffer = dev_ctx.template Alloc(&redBuffer); int64_t max_grid_x = dev_ctx.GetCUDAMaxGridDimSize()[0]; @@ -1201,7 +1201,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx, mean_data, temp_var_data); } - int flags = + auto flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h b/paddle/phi/kernels/gpu/instance_norm_utils.h index 865ab91da7b1b3..0e490734edaa15 100644 --- a/paddle/phi/kernels/gpu/instance_norm_utils.h +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -58,7 +58,8 @@ static __global__ void add_param(const T 
*input, for (int i = blockIdx.x; i < C; i += gridDim.x) { MPType ou = static_cast(0); for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { - const int index = j * C + i; + const auto index(j * C + i); + ou = ou + static_cast(input[index]); } ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index ed5f6438ab0c49..8d812a08f7bacb 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -79,7 +79,7 @@ struct ThreadAssigner<1> { const int cols_per_thread, int *last_tid_idx) { int cols_this_thread = cols_per_thread; - int last_tid = (cols / cols_per_thread); + auto last_tid = (cols / cols_per_thread); *last_tid_idx = last_tid; if (threadIdx.x == last_tid) { cols_this_thread = cols - cols_per_thread * last_tid; diff --git a/paddle/phi/kernels/gpu/lrn_grad_kernel.cu b/paddle/phi/kernels/gpu/lrn_grad_kernel.cu index e582bb7b6cbb91..aeba1c2da91df1 100644 --- a/paddle/phi/kernels/gpu/lrn_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lrn_grad_kernel.cu @@ -34,7 +34,8 @@ __global__ void KeCMRNormDiff(int img_size, if (idx < img_size) { const int w = idx % W; const int h = (idx / W) % H; - const int n = idx / W / H; + const auto n(idx / W / H); + const int offset = (data_layout != DataLayout::kNHWC ? (n * C * H + h) * W + w : ((n * H + h) * W + w) * C); @@ -45,8 +46,9 @@ __global__ void KeCMRNormDiff(int img_size, x_g += offset; const int step = H * W; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; + const auto pre_pad(size - (size + 1) / 2); + + const auto post_pad(size - pre_pad - 1); int index = 0; T accum = 0; @@ -88,7 +90,7 @@ void CrossMapNormalGrad(const phi::GPUContext& dev_ctx, T alpha, T beta, const DataLayout data_layout) { - int img_size = N * H * W; + auto img_size = N * H * W; const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; diff --git a/paddle/phi/kernels/gpu/lrn_kernel.cu b/paddle/phi/kernels/gpu/lrn_kernel.cu index 41c9febf733942..ebf50dd00eb774 100644 --- a/paddle/phi/kernels/gpu/lrn_kernel.cu +++ b/paddle/phi/kernels/gpu/lrn_kernel.cu @@ -31,7 +31,8 @@ __global__ void KeCMRNormFillScale(int img_size, if (idx < img_size) { const int w = idx % W; const int h = (idx / W) % H; - const int n = idx / W / H; + const auto n(idx / W / H); + const int offset = (data_layout != DataLayout::kNHWC ? 
(n * C * H + h) * W + w : ((n * H + h) * W + w) * C); @@ -40,7 +41,7 @@ __global__ void KeCMRNormFillScale(int img_size, mid += offset; const int step = H * W; const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; + const auto post_pad(size - pre_pad - 1); T accum = 0; int index = 0; @@ -90,14 +91,14 @@ void CrossMapNormal(const phi::GPUContext& dev_ctx, T alpha, T beta, const DataLayout data_layout) { - int img_size = N * H * W; + auto img_size = N * H * W; const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha, data_layout); - int input_size = N * H * W * C; + auto input_size = N * H * W * C; grid_size = (input_size + block_size - 1) / block_size; KeCMRNormOutput<<>>( input_size, inputs, mid, -beta, outputs); diff --git a/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu b/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu index 5184edec460c6e..6a43fea5a344d8 100644 --- a/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu +++ b/paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu @@ -98,7 +98,8 @@ size_t CalcSortScoresPerClassWorkspaceSize(const int num, const int num_classes, const int num_preds_per_class) { size_t wss[4]; - const int array_len = num * num_classes * num_preds_per_class; + const auto array_len(num * num_classes * num_preds_per_class); + wss[0] = array_len * sizeof(T); // temp scores wss[1] = array_len * sizeof(int); // temp indices wss[2] = (num * num_classes + 1) * sizeof(int); // offsets @@ -180,7 +181,8 @@ __launch_bounds__(nthds_per_cta) __global__ if (cur_idx < num_preds_per_batch) { const int class_idx = cur_idx / num_preds_per_class; for (int i = 0; i < num; i++) { - const int target_idx = i * num_preds_per_batch + cur_idx; + const auto target_idx(i * num_preds_per_batch + cur_idx); + const T_SCORE score = conf_scores_gpu[target_idx]; // "Clear" background labeled score and index @@ -227,7 +229,8 @@ __launch_bounds__(nthds_per_cta) __global__ } if ((cur_idx % num_preds_per_class) == 0) { - const int offset_ct = i * num_classes + cur_idx / num_preds_per_class; + const auto offset_ct(i * num_classes + cur_idx / num_preds_per_class); + d_offsets[offset_ct] = offset_ct * num_preds_per_class; // set the last element in d_offset if (blockIdx.x == 0 && threadIdx.x == 0) @@ -251,7 +254,8 @@ void SortScoresPerClassGPU(gpuStream_t stream, const float score_shift) { const int num_segments = num * num_classes; void* temp_scores = workspace; - const int array_len = num * num_classes * num_preds_per_class; + const auto array_len(num * num_classes * num_preds_per_class); + void* temp_idx = GetNextWorkspacePtr(reinterpret_cast(temp_scores), array_len * sizeof(T_SCORE)); void* d_offsets = GetNextWorkspacePtr(reinterpret_cast(temp_idx), @@ -677,7 +681,8 @@ __launch_bounds__(nthds_per_cta) __global__ i += gridDim.x * nthds_per_cta) { const int imgId = i / keep_top_k; const int detId = i % keep_top_k; - const int offset = imgId * num_classes * top_k; + const auto offset(imgId * num_classes * top_k); + const int index = indices[offset + detId]; const T_SCORE score = scores[offset + detId]; if (index == -1) { diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index 34c4a1391e3dfe..43c443de190341 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -70,7 +70,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs_data, int right = num_categories; while 
(right - left > 0) { - int mid = left + (right - left) / 2; + auto mid = left + (right - left) / 2; T temp_prob = cumulative_probs_data[mid]; if (temp_prob < rng_number) { diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu index 6df5941a1b794e..b41e3935c623b2 100644 --- a/paddle/phi/kernels/gpu/norm_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_kernel.cu @@ -66,7 +66,8 @@ __global__ void Normalize(const T* x, } __syncthreads(); for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { - const int index = base + j * post; + const auto index(base + j * post); + y[index] = static_cast((static_cast(x[index]) / norm)); } } diff --git a/paddle/phi/kernels/gpu/prior_box_kernel.cu b/paddle/phi/kernels/gpu/prior_box_kernel.cu index 73049ae7572dc0..03e152e5078991 100644 --- a/paddle/phi/kernels/gpu/prior_box_kernel.cu +++ b/paddle/phi/kernels/gpu/prior_box_kernel.cu @@ -46,13 +46,13 @@ __global__ void GenPriorBox(T* out, const int min_num, bool is_clip, bool min_max_aspect_ratios_order) { - int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; - int box_num = height * width * num_priors; + auto num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; + auto box_num = height * width * num_priors; CUDA_KERNEL_LOOP(i, box_num) { int h = i / (num_priors * width); int w = (i / num_priors) % width; int p = i % num_priors; - int m = max_sizes ? p / (as_num + 1) : p / as_num; + auto m = max_sizes ? p / (as_num + 1) : p / as_num; T cx = (w + offset) * step_width; T cy = (h + offset) * step_height; T bw, bh; @@ -158,7 +158,7 @@ void PriorBoxKernel(const Context& dev_ctx, num_priors += max_sizes.size(); } int min_num = static_cast(min_sizes.size()); - int box_num = width * height * num_priors; + auto box_num = width * height * num_priors; int block = 512; int grid = (box_num + block - 1) / block; diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu index 13f0b12fa7e0d7..9f964ee1bce719 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -97,7 +97,7 @@ __global__ void GPUPSROIPoolBackward(const int64_t nthreads, T diff_val = is_empty ? 0. 
: dout_data[i] / bin_area; for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; + auto input_index = ih * width + iw; phi::CudaAtomicAdd(offset_dx_data + input_index, diff_val); } } diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu index 1193c18131ce33..9a2065812c07b5 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu @@ -52,7 +52,7 @@ __global__ void GPUPSROIPoolForward(const int nthreads, int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; + auto n = i / pooled_width / pooled_height / output_channels; // set roi_batch_id int roi_batch_id = rois_batch_id_data[n]; @@ -86,7 +86,7 @@ __global__ void GPUPSROIPoolForward(const int nthreads, wend = min(max(wend, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); - int input_channel = (c * pooled_height + ph) * pooled_width + pw; + auto input_channel = (c * pooled_height + ph) * pooled_width + pw; const T* offset_input_data = input_data + (roi_batch_id * input_channels + input_channel) * height * width; @@ -94,7 +94,7 @@ __global__ void GPUPSROIPoolForward(const int nthreads, for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; + auto input_index = ih * width + iw; outsum += offset_input_data[input_index]; } } diff --git a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu index 7144d89c72660e..6e6c76e27d1325 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu +++ b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu @@ -236,7 +236,7 @@ void RepeatInterleaveKernel(const Context& dev_ctx, } // Get actual dimension const int ndim = x.dims().size(); - const int target_dim = (dim < 0) ? ndim + dim : dim; + const auto target_dim((dim < 0) ? ndim + dim : dim); // Calculate sizes int64_t outer_size = 1; diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu index 20015f7b875952..292d52c80fb7f9 100644 --- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu @@ -59,9 +59,11 @@ void HostRMSNormGradient(const Context& dev_ctx, const int part_size = 16; const dim3 threads2(32, 4, 1); const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1); - const int nshared2_a = - 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); - const int nshared2_b = threads2.x * threads2.y * sizeof(U); + const auto nshared2_a(2 * sizeof(U) * threads2.y * threads2.y * + (threads2.x + 1)); + + const auto nshared2_b(threads2.x * threads2.y * sizeof(U)); + const int nshared2 = nshared2_a > nshared2_b ? 
nshared2_a : nshared2_b; std::vector shape = {part_size, n2}; DenseTensor part_grad_gamma( @@ -84,7 +86,8 @@ void HostRMSNormGradient(const Context& dev_ctx, const dim3 threads3(32, 8, 1); const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1); - const int nshared3 = threads3.x * threads3.y * sizeof(U); + const auto nshared3(threads3.x * threads3.y * sizeof(U)); + cuComputeGradGammaBeta<<>>( part_grad_gamma.data(), part_grad_gamma.data(), /* unused */ @@ -100,7 +103,7 @@ void HostRMSNormGradient(const Context& dev_ctx, const uint64_t maxGridY = dev_ctx.GetCUDAMaxGridDimSize()[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); const dim3 threads1(32, 4, 1); - int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0; + auto nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0; const V* gamma_tmp = gamma; cuComputeGradInput<<>>( diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 3cb34f6eaedfbe..c3a46a94db4a2c 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -47,7 +47,7 @@ void RollGradKernel(const Context& dev_ctx, shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; } else { for (int i = 0; i < rank; i++) { - int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + auto dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); int64_t size = input_dim[dim]; if (size != 0) { shifts_data[i] = ((-shifts_data[i]) % size + size) % size; diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index 318551221b1ffb..dcd4ae617cb970 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -47,7 +47,7 @@ void RollKernel(const Context& dev_ctx, shifts_data[0] = (shifts_data[0] % numel + numel) % numel; } else { for (int i = 0; i < rank; i++) { - int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); + auto dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size(); int64_t size = input_dim[dim]; if (size != 0) { diff --git a/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu index ac61f86fed3e19..69eed3208adcda 100644 --- a/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu @@ -121,7 +121,7 @@ __global__ void RowConvGradFilterImproved(const T *in, int xdim_sh_in = block_y; int xdim_sh_dout = block_y; int ydim_sh_in = block_x; - int ydim_sh_dout = block_x + future_context - 1; + auto ydim_sh_dout = block_x + future_context - 1; int ydim_sh_dfilter = block_y; T *sh_in = mem; @@ -154,7 +154,7 @@ __global__ void RowConvGradFilterImproved(const T *in, __syncthreads(); if (thy < future_context - 1) { - int pos_offset = pos - future_context + 1; + auto pos_offset = pos - future_context + 1; sh_dout[thx * ydim_sh_dout + thy] = (d < input_dim && pos_offset >= start) ? 
dout[pos_offset * input_dim + d] diff --git a/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu b/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu index 77ca140bd22ad2..84bd0c43478a13 100644 --- a/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu @@ -68,7 +68,7 @@ struct SequenceExpandGradFunctor { ref_lod.size())); int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; - int thread_z = 1024 / thread_x / thread_y; + auto thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); diff --git a/paddle/phi/kernels/gpu/sequence_expand_kernel.cu b/paddle/phi/kernels/gpu/sequence_expand_kernel.cu index 9c8817431efdbf..62d367916fae09 100644 --- a/paddle/phi/kernels/gpu/sequence_expand_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_expand_kernel.cu @@ -126,7 +126,7 @@ struct SequenceExpandFunctor { int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); int thread_y = 16; - int thread_z = 1024 / thread_x / thread_y; + auto thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); diff --git a/paddle/phi/kernels/gpu/shuffle_channel.h b/paddle/phi/kernels/gpu/shuffle_channel.h index 59e067374e113d..50c574a1484e38 100644 --- a/paddle/phi/kernels/gpu/shuffle_channel.h +++ b/paddle/phi/kernels/gpu/shuffle_channel.h @@ -37,9 +37,11 @@ __global__ void ShuffleChannel(const int nthreads, int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (size_t ii = index; ii < nthreads; ii += offset) { - const int n = index / group_row / group_column / len; + const auto n(index / group_row / group_column / len); + const int i = (index / group_column / len) % group_row; - const int j = index / len % group_column; + const auto j(index / len % group_column); + const int k = index - (n * feature_map_size + (i * group_column + j) * len); T* p_o = output + n * feature_map_size + (j * group_row + i) * len; p_o[k] = input[index]; diff --git a/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu b/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu index 3c130e4ec56751..3bad8066dc8611 100644 --- a/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu @@ -40,7 +40,7 @@ void ShuffleChannelGradOpCUDAKernel(const Context& dev_ctx, int blocks = NumBlocks(out_grad.numel()); int threads = kNumCUDAThreads; - int count = num * group_column * group_row * sp_sz; + auto count = num * group_column * group_row * sp_sz; ShuffleChannel<<>>(count, feature_map_size, diff --git a/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu b/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu index 6348a486f2e735..0270753c61900c 100644 --- a/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_channel_kernel.cu @@ -36,7 +36,7 @@ void ShuffleChannelOpCUDAKernel(const Context& dev_ctx, int group_row = group; int group_column = channel / group_row; // count is the product of NCHW same as numel() - int count = num * group_column * group_row * sp_sz; + auto count = num * group_column * group_row * sp_sz; int blocks = NumBlocks(out->numel()); int threads = kNumCUDAThreads; diff --git a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu index 
fde94d4b70a188..81995c36ded47a 100644 --- a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu +++ b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu @@ -268,7 +268,7 @@ __global__ void GetSlogDetV2FromLU(const T* lu_data, T* logdet_data) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < batch_size) { - int offset_lu = idx * n * n; + auto offset_lu = idx * n * n; int offset_ipiv = idx * n; T det_val = T(1.0); for (int i = 0; i < n; i++) { diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index d7df2581f9656e..a2f6a2eb39fbca 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -529,7 +529,7 @@ __global__ void KeMatrixTopPBeamTopKFt(const T* src, count_iter_begin[bid] += 1; if (val < threshold_now) { // don't sample low score token - int start_id = i == 0 ? 0 : i - 1; + auto start_id = i == 0 ? 0 : i - 1; for (int j = start_id; j >= 0; j--) { float val_now = static_cast(beam_max[j].v); if (val_now >= threshold_now || j == 0) { diff --git a/paddle/phi/kernels/gpu/tril_indices_kernel.cu b/paddle/phi/kernels/gpu/tril_indices_kernel.cu index be83f28451166b..1193284f074bef 100644 --- a/paddle/phi/kernels/gpu/tril_indices_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_indices_kernel.cu @@ -24,7 +24,7 @@ namespace phi { template __device__ inline int resolve_root_int(int b, int cX4, int x, int32_t sign) { - int bXb_cX4 = b * b - cX4; + auto bXb_cX4 = b * b - cX4; double sr = ::sqrt(static_cast(bXb_cX4)); T res = ::__double2ll_rd((-b + sign * sr) / 2); if (bXb_cX4 != static_cast(sr * sr)) { diff --git a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu index 98d2bfbea0743b..7bddfce64af2a8 100644 --- a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu @@ -37,7 +37,7 @@ __global__ void KernelUnpool2dMaxGrad(const int64_t nthreads, T* input_grad) { CUDA_KERNEL_LOOP_TYPE(linearIndex, nthreads, int64_t) { int c = (linearIndex / input_width / input_height) % channels; - int n = linearIndex / input_width / input_height / channels; + auto n = linearIndex / input_width / input_height / channels; output_grad += (n * channels + c) * output_height * output_width; IndT maxind = indices_data[linearIndex]; input_grad[linearIndex] = output_grad[maxind]; @@ -60,7 +60,7 @@ __global__ void KernelUnpool3dMaxGrad(const int64_t nthreads, T* input_grad) { CUDA_KERNEL_LOOP_TYPE(linearIndex, nthreads, int64_t) { int c = (linearIndex / input_depth / input_width / input_height) % channels; - int n = linearIndex / input_depth / input_width / input_height / channels; + auto n = linearIndex / input_depth / input_width / input_height / channels; output_grad += (n * channels + c) * output_depth * output_height * output_width; IndT maxind = indices_data[linearIndex]; diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index af6169ba9cb7b1..d0897612fa65b3 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -233,7 +233,7 @@ void ViterbiDecodeKernel(const Context& dev_ctx, std::vector historys; // We create tensor buffer in order to avoid allocating memory frequently // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... 
- int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; + auto buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; DenseTensor int_buffer = Empty(dev_ctx, {buffer_size}); funcs::TensorBuffer int_tensor_buffer(int_buffer); // create float tensor buffer diff --git a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu index ce22758e407862..82dac78871fe01 100644 --- a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu @@ -49,7 +49,7 @@ __device__ __forceinline__ float GenKeyFromWeight( random_num2 = rng.Random64(); seed_count++; } while (!random_num2); - int one_bit = __clzll(random_num2) + seed_count * 64; + auto one_bit = __clzll(random_num2) + seed_count * 64; u *= exp2f(-one_bit); float logk = (log1pf(u) / logf(2.0)) * (1 / weight); return logk; @@ -261,7 +261,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ const int tx = threadIdx.x; #pragma unroll for (int j = 0; j < ITEMS_PER_THREAD; j++) { - int idx = BLOCK_SIZE * j + tx; + auto idx = BLOCK_SIZE * j + tx; if (idx < neighbor_count) { float thread_weight = edge_weight[start + idx]; weight_keys[j] = GenKeyFromWeight(thread_weight, rng); @@ -274,14 +274,14 @@ __launch_bounds__(BLOCK_SIZE) __global__ BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped( weight_keys, neighbor_idxs, max_sample_count, valid_count); __syncthreads(); - const int stride = BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count; + const auto stride(BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count); for (int idx_offset = ITEMS_PER_THREAD * BLOCK_SIZE; idx_offset < neighbor_count; idx_offset += stride) { #pragma unroll for (int j = 0; j < ITEMS_PER_THREAD; j++) { - int local_idx = BLOCK_SIZE * j + tx - max_sample_count; + auto local_idx = BLOCK_SIZE * j + tx - max_sample_count; int target_idx = idx_offset + local_idx; if (local_idx >= 0 && target_idx < neighbor_count) { float thread_weight = edge_weight[start + target_idx]; @@ -299,7 +299,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ } #pragma unroll for (int j = 0; j < ITEMS_PER_THREAD; j++) { - int idx = j * BLOCK_SIZE + tx; + auto idx = j * BLOCK_SIZE + tx; if (idx < max_sample_count) { sample_output[offset + idx] = in_rows[start + neighbor_idxs[j]]; if (return_eids) { diff --git a/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu index a4821e6534463d..c1cea289d6b09c 100644 --- a/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_head_kernel.cu @@ -40,7 +40,7 @@ __global__ void YoloBoxHeadCudaKernel(const T* input, return; } const int grids_num = grid_size_x * grid_size_y; - const int bbindex = y_id * grid_size_x + x_id; + const auto bbindex(y_id * grid_size_x + x_id); // objectness output[bbindex + grids_num * (z_id * (5 + class_num) + 4)] = @@ -81,7 +81,8 @@ void YoloBoxHeadKernel(const Context& dev_ctx, const T* input_data = x.data(); T* output_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); auto stream = dev_ctx.stream(); - const int volume = x_dims[1] * h * w; + const auto volume(x_dims[1] * h * w); + dim3 block(16, 16, 4); dim3 grid((grid_size_x / block.x) + 1, (grid_size_y / block.y) + 1, diff --git a/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu index 1e2613c5cab773..7200459df3fe7c 100644 --- a/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu @@ 
-147,7 +147,8 @@ __global__ void YoloBoxNum(const float* input, } const int grids_num = grid_size * grid_size; - const int bbindex = y_id * grid_size + x_id; + const auto bbindex(y_id * grid_size + x_id); + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; if (objectness < prob_thresh) { return; @@ -178,7 +179,8 @@ __global__ void YoloTensorParseKernel(const float* input, const float pic_h = im_shape_data[0] / im_scale_data[0]; const float pic_w = im_shape_data[1] / im_scale_data[1]; const int grids_num = grid_size * grid_size; - const int bbindex = y_id * grid_size + x_id; + const auto bbindex(y_id * grid_size + x_id); + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; if (objectness < prob_thresh) { return; @@ -434,7 +436,7 @@ void YoloBoxPostKernel(const Context& dev_ctx, int c = boxes_input_dims[input_id][1]; int h = boxes_input_dims[input_id][2]; int w = boxes_input_dims[input_id][3]; - int ts_id = batch_id * boxes_input.size() + input_id; + auto ts_id = batch_id * boxes_input.size() + input_id; int bbox_count_max_alloc = ts_info[ts_id].bbox_count_max_alloc; YoloTensorParseCuda( @@ -494,7 +496,7 @@ void YoloBoxPostKernel(const Context& dev_ctx, for (int batch_id = 0; batch_id < batch; batch_id++) { std::vector bbox_det_vec; for (int input_id = 0; input_id < boxes_input.size(); input_id++) { - int ts_id = batch_id * boxes_input.size() + input_id; + auto ts_id = batch_id * boxes_input.size() + input_id; int bbox_count = ts_info[ts_id].bbox_count_host; if (bbox_count <= 0) { continue; diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 6a4280faa4aea0..380f448aa829ce 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -129,8 +129,8 @@ void ConvCudnnGradKernelImplV7( &o_w); } - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; + auto group_offset_in = i_c / groups * i_h * i_w * i_d; + auto group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel->numel() / groups; // ------------------- cudnn backward algorithm --------------------- @@ -1186,8 +1186,8 @@ void ConvCudnnGradGradKernel( &o_h, &o_w); - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; + auto group_offset_in = i_c / groups * i_h * i_w * i_d; + auto group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = W->numel() / groups; ScalingParamType alpha = 1.0f; diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 42ff83420526a9..47a7f6f58ca19b 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -142,8 +142,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, &o_w); } - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; + auto group_offset_in = i_c / groups * i_h * i_w * i_d; + auto group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel->numel() / groups; // ------------------- cudnn conv workspace --------------------- size_t workspace_size = 0; // final workspace to allocate. 
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index ee222d3291b3cd..362891fc2eb067 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -290,8 +290,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, // ------------------- cudnn conv backward data --------------------- // FIxME(typhoonzero): template type T may not be the same as cudnn call. - int x_offset = x.numel() / x.dims()[0] / groups; - int dout_offset = + auto x_offset = x.numel() / x.dims()[0] / groups; + auto dout_offset = transformed_dout.numel() / transformed_dout.dims()[0] / groups; int filter_offset = filter.numel() / groups; ScalingParamType alpha = 1.0f; @@ -835,9 +835,9 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( &o_h, &o_w); - int group_offset_in = + auto group_offset_in = transformed_x.numel() / transformed_x.dims()[0] / groups; - int group_offset_out = + auto group_offset_out = transformed_dout.numel() / transformed_dout.dims()[0] / groups; int group_offset_filter = filter.numel() / groups; diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 26b8827620c759..6a8e01d3d94bf8 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -114,8 +114,8 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, #endif // ------------------- cudnn conv transpose forward --------------------- - int x_offset = transformed_x->numel() / transformed_x->dims()[0] / groups; - int out_offset = + auto x_offset = transformed_x->numel() / transformed_x->dims()[0] / groups; + auto out_offset = transformed_out->numel() / transformed_out->dims()[0] / groups; int filter_offset = filter->numel() / groups; ScalingParamType alpha = 1.0f; diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 7706299a92d92c..240f1a4f3c7216 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -1295,7 +1295,7 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx, // use 128 threads per block to maximize gpu utilization constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; IndexType blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(warp_size, warps_per_block, 1); @@ -1389,7 +1389,7 @@ void SoftmaxBackwardCUDAKernelDriverImpl(const GPUContext& dev_ctx, constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); + auto warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; IndexType blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(warp_size, warps_per_block, 1); diff --git a/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h b/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h index aee6bd1e5ab9cc..40abc56e5b7c4b 100644 --- a/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h +++ b/paddle/phi/kernels/impl/anchor_generator_kernel_impl.h @@ -112,7 +112,7 @@ void AnchorGeneratorOpKernel(const Context& dev_ctx, var_et(0, i) = variances[i]; } - int anchor_num = feature_height * feature_width * num_anchors; + auto anchor_num = 
feature_height * feature_width * num_anchors; auto var_dim = vars->dims(); vars->Resize({anchor_num, static_cast(variances.size())}); diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h index fe734bfb3dc441..b022b7774dfa1f 100644 --- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -50,8 +50,8 @@ void ApplyBroadcast(const Context& dev_ctx, Eigen::DSizes bcast_dims; std::vector new_input_dims_vec(out_rank); for (int i = 0; i < out_rank; i++) { - int in_axis = in_rank - i - 1; - int out_axis = out_rank - i - 1; + auto in_axis = in_rank - i - 1; + auto out_axis = out_rank - i - 1; bcast_dims[out_axis] = output_dims[out_axis]; new_input_dims_vec[out_axis] = 1; diff --git a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h index f4dd48013d4b04..32c2eaba5ad20e 100644 --- a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h @@ -66,9 +66,11 @@ struct EyeFunctor { HOSTDEVICE void operator()(size_t index) const { const int global_row = index / n_; - const int col = index - global_row * n_; + const auto col(index - global_row * n_); + const int batch = global_row / m_; - const int row = global_row - batch * m_; + const auto row(global_row - batch * m_); + output_[index] = col == row ? static_cast(1) : static_cast(0); } @@ -103,10 +105,12 @@ struct MatrixSetDiagFunctor { HOSTDEVICE void operator()(size_t index) const { const int batch_and_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_diag_index * max_diag_len_; + const auto index_in_the_diagonal(index - + batch_and_diag_index * max_diag_len_); + const int batch = batch_and_diag_index / num_diags_; - const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; + const auto diag_index_in_input(batch_and_diag_index - batch * num_diags_); + // diag_index=0 refers to the main diagonal const int diag_index = upper_diag_index_ - diag_index_in_input; // shift down for subdiagonal if diag_index < 0 @@ -119,7 +123,8 @@ struct MatrixSetDiagFunctor { // Upper-bound checks for diagonals shorter than max_diag_len. // y_index and x_index are nonnegative by construction. 
if (y_index < m_ && x_index < n_) { - const int out_index = batch * m_ * n_ + y_index * n_ + x_index; + const auto out_index(batch * m_ * n_ + y_index * n_ + x_index); + output_[out_index] = diag_[index]; } } @@ -152,11 +157,13 @@ struct MatrixDiagPartFunctor { HOSTDEVICE void operator()(size_t index) const { const int batch_and_mapped_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_mapped_diag_index * max_diag_len_; + const auto index_in_the_diagonal(index - batch_and_mapped_diag_index * + max_diag_len_); + const int batch = batch_and_mapped_diag_index / num_diags_; - const int mapped_diag_index = - batch_and_mapped_diag_index - batch * num_diags_; + const auto mapped_diag_index(batch_and_mapped_diag_index - + batch * num_diags_); + // diag_index=0 refers to the main diagonal const int diag_index = upper_diag_index_ - mapped_diag_index; // shift down for subdiagonal if diag_index < 0 diff --git a/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h b/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h index 761fc62e20b263..3edb2b9495d52d 100644 --- a/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h +++ b/paddle/phi/kernels/impl/collect_fpn_proposals_kernel_impl.h @@ -109,9 +109,10 @@ void CollectFpnProposalsOpKernel( integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois; } - const int batch_size = (num_size == 0) - ? multi_layer_rois[0]->lod().back().size() - 1 - : multi_rois_num[0]->numel(); + const auto batch_size((num_size == 0) + ? multi_layer_rois[0]->lod().back().size() - 1 + : multi_rois_num[0]->numel()); + // concatenate all fpn rois scores into a list // create a vector to store all scores std::vector> scores_of_all_rois( diff --git a/paddle/phi/kernels/impl/diag_embed_impl.h b/paddle/phi/kernels/impl/diag_embed_impl.h index c6dd1cf7df4871..7e76e9489a3a23 100644 --- a/paddle/phi/kernels/impl/diag_embed_impl.h +++ b/paddle/phi/kernels/impl/diag_embed_impl.h @@ -83,8 +83,8 @@ void DiagEmbedKernel(const Context& dev_ctx, set_zero(dev_ctx, out, static_cast(0.0)); auto out_dims = out->dims(); - int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; - int dim2_ = dim2 < 0 ? out_dims.size() + dim2 : dim2; + auto dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; + auto dim2_ = dim2 < 0 ? 
out_dims.size() + dim2 : dim2; auto stride = common::stride(out_dims); int64_t diag_size; int64_t storage_offset = 0; diff --git a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h index f204d2efdc6d3a..1a9154d48fb0a6 100644 --- a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h @@ -40,14 +40,14 @@ void FoldGradKernel(const Context& dev_ctx, const auto& x_dims = x_grad->dims(); const int64_t batch_size = x_dims[0]; - int output_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + + auto output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + auto output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + 1; - int output_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; int64_t n_input_plane = x_dims[1]; int64_t n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h index a0ac45d3c6bfe6..a84a5577fef891 100644 --- a/paddle/phi/kernels/impl/fold_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -39,14 +39,14 @@ void FoldKernel(const Context& dev_ctx, phi::funcs::Col2ImFunctor col2im; const auto& x_dims = x.dims(); - int output_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + + auto output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + auto output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + 1; - int output_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; int64_t n_input_plane = x_dims[1]; int64_t n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); diff --git a/paddle/phi/kernels/impl/im2sequence_kernel_impl.h b/paddle/phi/kernels/impl/im2sequence_kernel_impl.h index a6265e5b30836f..444291aabd005a 100644 --- a/paddle/phi/kernels/impl/im2sequence_kernel_impl.h +++ b/paddle/phi/kernels/impl/im2sequence_kernel_impl.h @@ -26,8 +26,9 @@ namespace phi { inline int Im2SeqOutputSize( int input_size, int filter_size, int padding_0, int padding_1, int stride) { - const int output_size = - (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + const auto output_size( + (input_size + padding_0 + padding_1 - filter_size) / stride + 1); + return output_size; } diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h index 1a23e6d845781d..95c9028aa1cdc0 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -240,7 +240,7 @@ __global__ void ReduceAbsMaxKernel(const T* x, for (int row_idx = blockIdx.x; row_idx < rows; row_idx += gridDim.x) { for (int col_idx = threadIdx.x * VecSize; col_idx < cols; col_idx += blockDim.x * VecSize) { - int32_t linear_index = row_idx * cols + col_idx; + auto linear_index = row_idx * cols + col_idx; phi::Load(x + linear_index, &in_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -284,7 +284,7 @@ __global__ void QuantActKernel(const T* x, linear_index < elem_cnt; linear_index += 
gridDim.x * blockDim.x * VecSize) { int row_idx = linear_index / cols; - int col_idx = + auto col_idx = linear_index - row_idx * cols; // equal to linear_index % cols phi::Load(x + linear_index, &in_vec); int32_t local_outlier_idx = outlier_idx[col_idx / 32]; @@ -353,13 +353,13 @@ __global__ void SplitKernel(const T* x, if (linear_idx < sub_w_elem_cnt) { constexpr int32_t k_permute_const = 8; int32_t k_mod_16 = k_id % 16; - int32_t temp_k_expr_1 = k_mod_16 - k_mod_16 / 8 * 8; + auto temp_k_expr_1 = k_mod_16 - k_mod_16 / 8 * 8; int32_t temp_k_expr_2 = k_mod_16 / 8; - int32_t permute_kk = temp_k_expr_1 + temp_k_expr_2 + - (temp_k_expr_2 + 1) % 2 * k_mod_16 * 2 / 2 + - temp_k_expr_1 * temp_k_expr_2 + k_id / 16 * 16; - int32_t permute_index = permute_kk % 64 + permute_kk / 64 * 128 + - 64 * (row_idx % 2) + k * 2 * (row_idx / 2); + auto permute_kk = temp_k_expr_1 + temp_k_expr_2 + + (temp_k_expr_2 + 1) % 2 * k_mod_16 * 2 / 2 + + temp_k_expr_1 * temp_k_expr_2 + k_id / 16 * 16; + auto permute_index = permute_kk % 64 + permute_kk / 64 * 128 + + 64 * (row_idx % 2) + k * 2 * (row_idx / 2); int8_t shifted_weight = static_cast( static_cast(weight[permute_index]) - 128); sub_weight[row_idx * kfp_num + col_idx] = @@ -431,7 +431,7 @@ __global__ void DequantMergeKernel(const int32_t* x, for (int row_idx = blockIdx.x; row_idx < m; row_idx += gridDim.x) { for (int col_idx = threadIdx.x * VecSize; col_idx < n; col_idx += blockDim.x * VecSize) { - int linear_idx = row_idx * n + col_idx; + auto linear_idx = row_idx * n + col_idx; phi::Load(x_fp + linear_idx, &x_fp_vec); phi::Load(x + linear_idx, &x_vec); #pragma unroll diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eebbf23..d9b621b2e75683 100644 --- a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -26,7 +26,8 @@ struct IdentityMatrixFunctor { IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; + const auto row(index / m_ % m_); + const int col = index % m_; output_[index] = col == row ? 
static_cast(1) : static_cast(0); } diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h index 3546b91d66fc12..c17a2a9ed20e9d 100644 --- a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h @@ -40,7 +40,7 @@ void UnStackGradKernel(const Context &dev_ctx, for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; #if defined(__NVCC__) || defined(__HIPCC__) - int total_num = pre * n * post; + auto total_num = pre * n * post; thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h index 82c78aad85e5ef..1723068a33afe9 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h @@ -150,7 +150,7 @@ __global__ void weight_interleave_add_bias_kernel_wint4(int8_t* input_data_dev, #pragma unroll for (int idx = 0; idx < 8; ++idx) { const int offset = idx / 4; - const int src = (idx % 4) * 2 + offset; + const auto src((idx % 4) * 2 + offset); const int src_shift = src * 4; const int dst_shift = idx * 4; @@ -314,7 +314,7 @@ __global__ void per_channel_quant_gpu_int4_row_pack(const T* weight_data, for (int i = 0; i < VectorSize / 2; ++i) { int8_t packed_int4s = 0; for (int pack = 0; pack < 2; ++pack) { - int vector_index = i * 2 + pack; + auto vector_index = i * 2 + pack; const float r_scale = 1 / static_cast(scale[vector_index]); const float weight_elt = static_cast(weight[vector_index]) * r_scale; diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h index bd1c4b1d865af2..eab8135a221740 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h @@ -224,8 +224,9 @@ void add_bias_and_interleave_inplace(int8_t* tensor_ptr, size_t num_elts) { uint32_t transformed_register = 0; for (int dest_idx = 0; dest_idx < 8; ++dest_idx) { - const int src_idx = - dest_idx < 4 ? 2 * dest_idx : 2 * (dest_idx - 4) + 1; + const auto src_idx(dest_idx < 4 ? 2 * dest_idx + : 2 * (dest_idx - 4) + 1); + const int src_shift = 4 * src_idx; const int dest_shift = 4 * dest_idx; @@ -264,8 +265,9 @@ void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor, for (int tile_row = 0; tile_row < B_ROWS_PER_MMA; ++tile_row) { for (int write_col = 0; write_col < num_vec_cols; ++write_col) { const int write_row = base_row + tile_row; - const int tile_read_row = 8 * (((tile_row % ELTS_PER_REG) / 2)) + - tile_row % 2 + 2 * (tile_row / ELTS_PER_REG); + const auto tile_read_row(8 * (((tile_row % ELTS_PER_REG) / 2)) + + tile_row % 2 + 2 * (tile_row / ELTS_PER_REG)); + const int read_row = base_row + tile_read_row; const int read_col = write_col; diff --git a/paddle/phi/kernels/onednn/concat_kernel.cc b/paddle/phi/kernels/onednn/concat_kernel.cc index 2e7d79a330cee7..a72725909ea744 100644 --- a/paddle/phi/kernels/onednn/concat_kernel.cc +++ b/paddle/phi/kernels/onednn/concat_kernel.cc @@ -74,7 +74,7 @@ class ConcatOneDNNHandler : public OneDNNHandlerNoCachingT { bool ConcatCheckIfOneDNNSupport(const KernelContext* dev_ctx) { auto input0 = dev_ctx->InputAt(0); - int batch_size = + auto batch_size = !input0.lod().empty() ? 
input0.lod()[0].size() - 1 : input0.dims()[0]; if (dev_ctx->InputsSize() > 64 && batch_size < 1000) { return false; diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc index b1b6db198e3a12..0325d980559e26 100644 --- a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -52,9 +52,9 @@ void CalculateMatrixDims(const std::vector &x_dims, for (size_t i = 0; i < x_bd_dims->size() - 2; ++i) { (*out_bd_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - int h_idx = + auto h_idx = trans_x ? x_bd_dims->size() - 1 : x_bd_dims->size() - 2; // NOLINT - int w_idx = + auto w_idx = trans_y ? y_bd_dims->size() - 2 : y_bd_dims->size() - 1; // NOLINT (*out_bd_dims)[x_bd_dims->size() - 2] = (*x_bd_dims)[h_idx]; diff --git a/paddle/phi/kernels/onednn/multi_gru_kernel.cc b/paddle/phi/kernels/onednn/multi_gru_kernel.cc index 5a7250fc312fc6..debfb9c6d1194a 100644 --- a/paddle/phi/kernels/onednn/multi_gru_kernel.cc +++ b/paddle/phi/kernels/onednn/multi_gru_kernel.cc @@ -145,11 +145,11 @@ class MultiGRUHandler { layers_ * 2, scale_weights.size())); - const int weights_scale_mask = + const auto weights_scale_mask( 0 + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` - + - (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + (1 << 4)); + // bit, indicating the unique scales for `o` dim in `ldigo` int w_scale_num = scale_weights.size(); for (int i = 0; i < w_scale_num; ++i) { @@ -371,7 +371,7 @@ class MultiGRUHandler { auto* weight_x_data = reinterpret_cast(user_memory.get_data_handle()); - int idx = layer * 2 + (dir == R2L); + auto idx = layer * 2 + (dir == R2L); memcpy(weight_x_data, weights_x_[idx]->data(), sizeof(float) * ICs[layer] * 3 * OCs[layer]); @@ -414,7 +414,7 @@ class MultiGRUHandler { auto* weight_h_data = reinterpret_cast(user_memory.get_data_handle()); - int idx = layer * 2 + (dir == R2L); + auto idx = layer * 2 + (dir == R2L); auto* user_weight_h_data = weights_h_[idx]->data(); auto src1_iter = user_weight_h_data; @@ -465,7 +465,7 @@ class MultiGRUHandler { gru_pds_[{layer, dir}]->bias_desc(), engine_); auto* bias_data = reinterpret_cast(memory_p->get_data_handle()); - int idx = layer * 2 + (dir == R2L); + auto idx = layer * 2 + (dir == R2L); if (!biases_.empty() && biases_[idx]) { const float* user_bias_data = biases_[idx]->data(); // Bias in oneDNN is always float diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index 10983a4ef75290..8c0a3f9c588c3d 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -30,7 +30,7 @@ inline std::vector CalculateReducedDims( std::vector output_dims(common::vectorize(input->dims())); for (size_t i = 0; i < dims.size(); ++i) { // handle negative dims, f.e. "-1" means rightmost dimension - int index = (dims[i] >= 0) ? dims[i] : input->dims().size() + dims[i]; + auto index = (dims[i] >= 0) ? 
dims[i] : input->dims().size() + dims[i]; output_dims[index] = 1; } diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 11481a8b0249a8..c1f05936e5a668 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -543,7 +543,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1]; for (int stride = 1; stride <= stride_size; stride *= 2) { __syncthreads(); - int index = (tidx + 1) * 2 * stride - 1; + auto index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { temp[index + index / 32] = compute(temp[index + index / 32], @@ -552,7 +552,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, } for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) { __syncthreads(); - int index = (tidx + 1) * 2 * stride - 1; + auto index = (tidx + 1) * 2 * stride - 1; if ((index + stride) < (blockDim.x * 2)) { temp[index + stride + (stride + index) / 32] = compute(temp[index + stride + (stride + index) / 32], diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 75f510c13d18ff..e94e9b2916f6e9 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -140,7 +140,7 @@ struct BroadcastConfig { return kps::details::GetXpuReadLens(numel, 8, 64); } int max_buf_len = 512; - int buf_len = m / 16 * 16; + auto buf_len = m / 16 * 16; if (buf_len == 0) { buf_len = m; } @@ -408,7 +408,7 @@ __device__ __inline__ void ReadData(Ty* dst, break; } } - int fix = thread_offset + idx * stride_nx + idy * stride_ny; + auto fix = thread_offset + idx * stride_nx + idy * stride_ny; mfence_local(); GM2LM(src + fix, in_temp, sizeof(Tx)); dst[idy * NX + idx] = static_cast(in_temp[0]); @@ -1194,7 +1194,7 @@ __device__ __inline__ void ReadDataBc(T* dst, const details::BroadcastConfig& config, int total_num_output, int read_lens) { - int thread_offset = block_offset + core_id() * read_lens; + auto thread_offset = block_offset + core_id() * read_lens; if (config.cmp_type == details::OptType::MNK_M1K) { ReadDataBcM1kMnk(dst, src, thread_offset, config, read_lens); @@ -1248,7 +1248,7 @@ __device__ __forceinline__ void ReadDataBc( const details::BroadcastConfig& config, int total_num_output, int read_lens = NX) { - int thread_offset = block_offset + core_id() * read_lens; + auto thread_offset = block_offset + core_id() * read_lens; __local__ T in_temp[NX]; if (config.cmp_type == details::OptType::MNK_M1K) { @@ -1286,7 +1286,7 @@ __device__ __forceinline__ void ReadDataBc( */ template __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { - int thread_offset = block_offset + core_id() * NX; + auto thread_offset = block_offset + core_id() * NX; #pragma unroll for (int nx = 0; nx < NX; ++nx) { dst[nx] = static_cast(thread_offset + nx); diff --git a/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu b/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu index 437094d1422d35..978869851fb096 100644 --- a/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu +++ b/paddle/phi/kernels/stride/reduce_grad_stride_kernel.cu @@ -55,7 +55,7 @@ phi::DenseTensor CheckMultipleUnsqueeze(const Context& dev_ctx, std::vector axes(ndim, false); for (int i = 0; i < dims.size(); i++) { - int tmp_dim = dims[i] >= 0 ? dims[i] : ndim + dims[i]; + auto tmp_dim = dims[i] >= 0 ? 
dims[i] : ndim + dims[i]; axes[tmp_dim] = true; } diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 6e413ef73098dd..9a6ae0b4fec68d 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -136,7 +136,7 @@ void DeserializeOnCPU(const Context& dev_ctx, StringTensor* dst) { auto* strings_data = reinterpret_cast(src.data()); auto* strings_offset = reinterpret_cast(strings_data); - int numel = strings_offset[0] / sizeof(int) - 1; + auto numel = strings_offset[0] / sizeof(int) - 1; dst->Resize(common::make_ddim({numel})); dtype::pstring* dst_str = dev_ctx.template HostAlloc(dst); for (int i = 0; i < numel; ++i) { From 56e78de1dbf3184260bb2eb385aa79a2e70b93ae Mon Sep 17 00:00:00 2001 From: zrr1999 <2742392377@qq.com> Date: Fri, 7 Nov 2025 09:09:57 +0000 Subject: [PATCH 2/2] fix --- paddle/phi/kernels/strings/gpu/copy_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 9a6ae0b4fec68d..66f0a67df075c4 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -136,7 +136,7 @@ void DeserializeOnCPU(const Context& dev_ctx, StringTensor* dst) { auto* strings_data = reinterpret_cast(src.data()); auto* strings_offset = reinterpret_cast(strings_data); - auto numel = strings_offset[0] / sizeof(int) - 1; + int64_t numel = strings_offset[0] / sizeof(int) - 1; dst->Resize(common::make_ddim({numel})); dtype::pstring* dst_str = dev_ctx.template HostAlloc(dst); for (int i = 0; i < numel; ++i) {
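
For context on the final hunk ([PATCH 2/2]): in `strings_offset[0] / sizeof(int) - 1`, the `sizeof` operand pulls the whole expression up to the unsigned `size_t`, so an `auto` declaration deduces an unsigned type; pinning `numel` to `int64_t` keeps the element count signed, which is presumably what `common::make_ddim` and the `int` loop counter expect. The standalone C++17 sketch below is illustrative only — it is not Paddle code, and the names `rows`, `cols`, and `header` are invented. It shows the two deduction effects that motivate this series: `auto` preserves 64-bit width in index arithmetic (as in `auto total_q = batch_size * sizes[1]`), but it also inherits unsignedness once `sizeof` joins the expression.

// Standalone C++17 sketch (illustrative only, not Paddle code).
// (1) `auto` keeps 64-bit width for index arithmetic instead of
//     silently narrowing to int;
// (2) `auto` also picks up unsignedness once sizeof() enters the
//     expression, which is why the second commit names int64_t.
#include <cstdint>
#include <cstdio>
#include <type_traits>

int main() {
  // (1) Both operands are int64_t, so the product stays 64-bit.
  //     Declaring the result as `int` would truncate ~3.07e9.
  int64_t rows = 3000000, cols = 1024;
  auto linear_index = rows * cols;  // deduced as int64_t
  static_assert(std::is_same_v<decltype(linear_index), int64_t>);

  // (2) `header` stands in for a serialized offset word; dividing by
  //     sizeof(int) converts the whole expression to size_t, so the
  //     `- 1` wraps around instead of going negative.
  int header = 0;
  auto numel_auto = header / sizeof(int) - 1;  // deduced as size_t
  static_assert(std::is_same_v<decltype(numel_auto), std::size_t>);
  std::printf("auto-deduced numel: %zu (wrapped)\n", numel_auto);

  // Pinning the declared type, as the follow-up commit does, keeps the
  // count in signed 64-bit territory; the cast makes the intent explicit.
  int64_t numel = static_cast<int64_t>(header / sizeof(int)) - 1;
  std::printf("int64_t numel: %lld\n", static_cast<long long>(numel));
  return 0;
}

In short, `auto` is a reasonable default for intermediate index/offset expressions whose operands already have the intended width, while expressions involving `sizeof`, `.size()`, or other unsigned sources are safer with an explicitly named signed type, as the second commit does for `numel`.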