Commit 3d11a48

Cleanup and streamline
1 parent e7abe89 commit 3d11a48

10 files changed: +55 −135 lines
benchmarks/bench_logging_overhead.py

Lines changed: 2 additions & 19 deletions
@@ -5,11 +5,6 @@
 This script creates decorated and undecorated versions of a test function
 (torch.matmul) and compares their performance to accurately measure logging overhead.

-Why torch.matmul instead of bmm_fp8?
-- bmm_fp8 is already decorated in the FlashInfer source code
-- Using it would cause double-decoration and inaccurate results
-- torch.matmul gives us a clean baseline to measure pure decorator overhead
-
 Usage:
     # Set the logging level before running
     export FLASHINFER_LOGLEVEL_DBG=3
@@ -37,30 +32,18 @@
 LOG_DEST = os.environ.get("FLASHINFER_LOGDEST_DBG", "/tmp/flashinfer_benchmark_log.txt")

 # Import the decorator
-try:
-    from flashinfer.api_logging import flashinfer_api_log
-except ImportError as e:
-    print(f"Error: Could not import flashinfer: {e}")
-    print("Make sure flashinfer is installed.")
-    exit(1)
+from flashinfer.api_logging import flashinfer_log


 # Create two versions of a test function:
 # 1. Undecorated (baseline)
 # 2. Decorated (with logging)
-#
-# We use a simple torch.matmul instead of bmm_fp8 because bmm_fp8 is already
-# decorated in the source code, which would cause double-decoration.
-
-
 def test_matmul_undecorated(A, B):
-    """Undecorated version - baseline for comparison."""
     return torch.matmul(A, B)


-@flashinfer_api_log
+@flashinfer_log
 def test_matmul_decorated(A, B):
-    """Decorated version - with API logging."""
     return torch.matmul(A, B)

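Note: the script's actual timing loop is not part of this diff. A minimal sketch of how the decorated/undecorated comparison could be made is below; the function and helper names, matrix sizes, and iteration counts are illustrative, not taken from the benchmark.

    import time
    import torch
    from flashinfer.api_logging import flashinfer_log  # as in the patched script

    def matmul_plain(A, B):
        return torch.matmul(A, B)

    @flashinfer_log
    def matmul_logged(A, B):
        return torch.matmul(A, B)

    def avg_seconds(fn, A, B, iters=100):
        """Average wall-clock seconds per call to fn(A, B)."""
        for _ in range(10):  # warm-up: exclude one-time costs
            fn(A, B)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            fn(A, B)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return (time.perf_counter() - start) / iters

    A = torch.randn(256, 256)
    B = torch.randn(256, 256)
    overhead = avg_seconds(matmul_logged, A, B) - avg_seconds(matmul_plain, A, B)
    print(f"decorator overhead per call: {overhead * 1e6:.2f} us")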

flashinfer/api_logging.py

Lines changed: 3 additions & 66 deletions
@@ -32,8 +32,6 @@ def _substitute_process_id(path: str) -> str:

     This is useful for multi-process/multi-GPU environments where each process
     needs its own log file.
-
-    Example: "flashinfer_log_%i.txt" -> "flashinfer_log_12345.txt"
     """
     if "%i" in path:
         return path.replace("%i", str(os.getpid()))
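For reference, the retained branch above is the whole substitution: a destination such as "flashinfer_log_%i.txt" expands to a per-process file name. A quick illustration (the printed PID is machine-dependent):

    import os

    # Same substitution as shown in the hunk: "%i" -> current process ID.
    dest = "flashinfer_log_%i.txt".replace("%i", str(os.getpid()))
    print(dest)  # e.g. flashinfer_log_12345.txt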
@@ -46,38 +44,6 @@ def _substitute_process_id(path: str) -> str:
     os.environ.get("FLASHINFER_LOGDEST_DBG", "stdout")
 )

-# Enable cuDNN, cuBLAS, and cuBLASLt API logging when FlashInfer logging level >= 5
-# Only override if the user hasn't already configured the logging switch
-# If the switch is not set, we override both the switch and destination as a bundle
-if _API_LOG_LEVEL >= 5:
-    # cuBLAS logging: Check switch, set both switch and destination
-    if "CUBLAS_LOGINFO_DBG" not in os.environ:
-        os.environ["CUBLAS_LOGINFO_DBG"] = "1"
-        os.environ["CUBLAS_LOGDEST_DBG"] = _substitute_process_id(
-            "flashinfer_cublas_log_%i.txt"
-        )
-
-    # cuBLASLt logging: Check switch, set both switch and destination
-    if "CUBLASLT_LOG_LEVEL" not in os.environ:
-        os.environ["CUBLASLT_LOG_LEVEL"] = "2"
-        os.environ["CUBLASLT_LOG_FILE"] = _substitute_process_id(
-            "flashinfer_cublaslt_log_%i.txt"
-        )
-
-    # cuDNN backend logging: Check switch, set both switch and destination
-    if "CUDNN_LOGLEVEL_DBG" not in os.environ:
-        os.environ["CUDNN_LOGLEVEL_DBG"] = "2.5"
-        os.environ["CUDNN_LOGDEST_DBG"] = _substitute_process_id(
-            "flashinfer_cudnn_backend_log_%i.txt"
-        )
-
-    # cuDNN frontend logging: Check switch, set both switch and destination
-    if "CUDNN_FRONTEND_LOG_INFO" not in os.environ:
-        os.environ["CUDNN_FRONTEND_LOG_INFO"] = "1"
-        os.environ["CUDNN_FRONTEND_LOG_FILE"] = _substitute_process_id(
-            "flashinfer_cudnn_frontend_log_%i.txt"
-        )
-
 # Create logger using Python's logging library
 _logger = logging.getLogger("flashinfer.api")

@@ -185,28 +151,6 @@ def _log_system_info():
         # PyTorch version
         lines.append(f"PyTorch version: {torch.__version__}")

-        # cuDNN/cuBLAS/cuBLASLt logging status
-        if _API_LOG_LEVEL >= 5:
-            lines.append("")
-            lines.append("cuDNN/cuBLAS/cuBLASLt Logging: Enabled (Level 5)")
-            cublas_info = os.environ.get("CUBLAS_LOGINFO_DBG", "not set")
-            cublas_dest = os.environ.get("CUBLAS_LOGDEST_DBG", "not set")
-            cublaslt_level = os.environ.get("CUBLASLT_LOG_LEVEL", "not set")
-            cublaslt_file = os.environ.get("CUBLASLT_LOG_FILE", "not set")
-            cudnn_level = os.environ.get("CUDNN_LOGLEVEL_DBG", "not set")
-            cudnn_dest = os.environ.get("CUDNN_LOGDEST_DBG", "not set")
-            cudnn_fe_info = os.environ.get("CUDNN_FRONTEND_LOG_INFO", "not set")
-            cudnn_fe_file = os.environ.get("CUDNN_FRONTEND_LOG_FILE", "not set")
-
-            lines.append(f"  CUBLAS_LOGINFO_DBG={cublas_info}")
-            lines.append(f"  CUBLAS_LOGDEST_DBG={cublas_dest}")
-            lines.append(f"  CUBLASLT_LOG_LEVEL={cublaslt_level}")
-            lines.append(f"  CUBLASLT_LOG_FILE={cublaslt_file}")
-            lines.append(f"  CUDNN_LOGLEVEL_DBG={cudnn_level}")
-            lines.append(f"  CUDNN_LOGDEST_DBG={cudnn_dest}")
-            lines.append(f"  CUDNN_FRONTEND_LOG_INFO={cudnn_fe_info}")
-            lines.append(f"  CUDNN_FRONTEND_LOG_FILE={cudnn_fe_file}")
-
     except Exception as e:
         lines.append(f"Error gathering system information: {e}")

@@ -519,7 +463,7 @@ def _log_function_outputs(func_name: str, result: Any, level: int) -> None:
     _logger.debug("\n".join(lines))


-def flashinfer_api_log(func: Callable = None) -> Callable:
+def flashinfer_log(func: Callable = None) -> Callable:
     """
     Decorator to log FlashInfer API calls using Python's logging library.

@@ -544,7 +488,7 @@ def flashinfer_api_log(func: Callable = None) -> Callable:
     --------
     Basic usage:

-    >>> @flashinfer_api_log
+    >>> @flashinfer_log
     ... def my_function(x, y):
     ...     return x + y

@@ -563,13 +507,7 @@ def flashinfer_api_log(func: Callable = None) -> Callable:
     - **CUDA Graph Compatibility**: At level 5, tensor statistics (min/max/mean/nan_count)
       are automatically skipped during CUDA graph capture to avoid synchronization issues.
       The message "[statistics skipped: CUDA graph capture in progress]" will be logged.
-    - **cuDNN/cuBLAS/cuBLASLt Integration**: At level 5, if not already set by the user, the following
-      environment variables are automatically configured to enable cuDNN, cuBLAS, and cuBLASLt logging:
-      - CUBLAS_LOGINFO_DBG=1, CUBLAS_LOGDEST_DBG=flashinfer_cublas_log_%i.txt
-      - CUBLASLT_LOG_LEVEL=2, CUBLASLT_LOG_FILE=flashinfer_cublaslt_log_%i.txt
-      - CUDNN_LOGLEVEL_DBG=2.5, CUDNN_LOGDEST_DBG=flashinfer_cudnn_backend_log_%i.txt
-      - CUDNN_FRONTEND_LOG_INFO=1, CUDNN_FRONTEND_LOG_FILE=flashinfer_cudnn_frontend_log_%i.txt
-      The %i pattern is automatically replaced with the process ID for multi-process environments.
+    - The %i pattern is automatically replaced with the process ID for multi-process environments.
     - The logger does not propagate to the root logger to avoid duplicate logs.
     """
     # If logging is disabled, return original function with zero overhead
@@ -621,7 +559,6 @@ def wrapper(*args, **kwargs):

         return wrapper

-    # Support both @flashinfer_api_log and @flashinfer_api_log()
     if func is None:
         return decorator
     return decorator(func)
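The hunk above drops the explanatory comment but keeps the dual-form dispatch: the decorator works both as @flashinfer_log and as @flashinfer_log(). A minimal standalone sketch of that pattern is shown below; it is a skeleton only, the real decorator performs the logging inside wrapper.

    import functools
    from typing import Callable

    def flashinfer_log(func: Callable = None) -> Callable:
        # Skeleton of the dual-form decorator pattern.
        def decorator(f: Callable) -> Callable:
            @functools.wraps(f)
            def wrapper(*args, **kwargs):
                # The real decorator logs inputs/outputs here; this sketch just calls through.
                return f(*args, **kwargs)
            return wrapper

        if func is None:
            # Used as @flashinfer_log(): return the decorator itself.
            return decorator
        # Used as @flashinfer_log: decorate immediately.
        return decorator(func)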

flashinfer/cudnn/decode.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@

 import torch

-from ..api_logging import flashinfer_api_log
+from ..api_logging import flashinfer_log
 from .utils import get_cudnn_fmha_gen_module

 try:
@@ -253,7 +253,7 @@ def _batch_decode_with_kv_cache(
     return out


-@flashinfer_api_log
+@flashinfer_log
 def cudnn_batch_decode_with_kv_cache(
     q: torch.Tensor,
     k_cache: torch.Tensor,

flashinfer/cudnn/prefill.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@

 import torch

-from ..api_logging import flashinfer_api_log
+from ..api_logging import flashinfer_log
 from .utils import get_cudnn_fmha_gen_module

 try:
@@ -384,7 +384,7 @@ def _batch_prefill_with_kv_cache(
     return out, None


-@flashinfer_api_log
+@flashinfer_log
 def cudnn_batch_prefill_with_kv_cache(
     q: torch.Tensor,
     k_cache: torch.Tensor,

flashinfer/decode.py

Lines changed: 10 additions & 10 deletions
@@ -21,7 +21,7 @@

 import torch

-from .api_logging import flashinfer_api_log
+from .api_logging import flashinfer_log
 from .xqa import xqa, xqa_mla
 from .cudnn import cudnn_batch_decode_with_kv_cache as cudnn_batch_decode_with_kv_cache
 from .jit import (
@@ -313,7 +313,7 @@ def get_trtllm_gen_fmha_module():
     return op


-@flashinfer_api_log
+@flashinfer_log
 def single_decode_with_kv_cache_with_jit_module(
     jit_module: Any,
     q: torch.Tensor,
@@ -390,7 +390,7 @@ def single_decode_with_kv_cache(
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...


-@flashinfer_api_log
+@flashinfer_log
 def single_decode_with_kv_cache(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -649,7 +649,7 @@ class BatchDecodeWithPagedKVCacheWrapper:
     manages the lifecycle of these data structures.
     """

-    @flashinfer_api_log
+    @flashinfer_log
     def __init__(
         self,
         float_workspace_buffer: torch.Tensor,
@@ -813,7 +813,7 @@ def reset_workspace_buffer(
             pin_memory=True,
         )

-    @flashinfer_api_log
+    @flashinfer_log
     def plan(
         self,
         indptr: torch.Tensor,
@@ -1167,7 +1167,7 @@ def run(
         window_left: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...

-    @flashinfer_api_log
+    @flashinfer_log
     def run(
         self,
         q: torch.Tensor,
@@ -2065,7 +2065,7 @@ def _fake_paged_run(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_batch_decode_with_kv_cache(
     query: torch.Tensor,
     kv_cache: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
@@ -2339,7 +2339,7 @@ trtllm_batch_decode_with_kv_cache


 # xqa uses NHD layout
-@flashinfer_api_log
+@flashinfer_log
 def xqa_batch_decode_with_kv_cache(
     query: torch.Tensor,
     kv_cache: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
@@ -2524,7 +2524,7 @@ def _check_trtllm_gen_mla_shape(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_batch_decode_with_kv_cache_mla(
     query: torch.Tensor,
     kv_cache: torch.Tensor,
@@ -2686,7 +2686,7 @@ def trtllm_batch_decode_with_kv_cache_mla(
         raise ValueError(f"Backend {backend} not supported")


-@flashinfer_api_log
+@flashinfer_log
 def xqa_batch_decode_with_kv_cache_mla(
     query: torch.Tensor,
     kv_cache: torch.Tensor,
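Note: every public decode entry point above is wrapped with @flashinfer_log, so enabling the log is purely environment-driven. A rough usage sketch follows; the tensor shapes, dtypes, and log level are illustrative only, and the environment variables must be set before importing flashinfer because api_logging.py reads them at import time.

    import os

    # Configure before importing flashinfer; "%i" in the destination becomes the PID.
    os.environ["FLASHINFER_LOGLEVEL_DBG"] = "3"
    os.environ["FLASHINFER_LOGDEST_DBG"] = "flashinfer_api_log_%i.txt"

    import torch
    import flashinfer

    # Illustrative shapes only (NHD layout); see the API reference for exact requirements.
    q = torch.randn(32, 128, dtype=torch.float16, device="cuda")
    k = torch.randn(1024, 8, 128, dtype=torch.float16, device="cuda")
    v = torch.randn(1024, 8, 128, dtype=torch.float16, device="cuda")

    # This call is wrapped by @flashinfer_log, so its inputs and outputs are
    # recorded to the configured destination at the configured level.
    out = flashinfer.single_decode_with_kv_cache(q, k, v)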

flashinfer/fused_moe/core.py

Lines changed: 7 additions & 7 deletions
@@ -20,7 +20,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 import torch

-from ..api_logging import flashinfer_api_log
+from ..api_logging import flashinfer_log
 from ..autotuner import (
     AutoTuner,
     DynamicTensorSpec,
@@ -686,7 +686,7 @@


 # ref: https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py#L121
-@flashinfer_api_log
+@flashinfer_log
 def cutlass_fused_moe(
     input: torch.Tensor,
     token_selected_experts: torch.Tensor,
@@ -1859,7 +1859,7 @@ def _fake_trtllm_fp4_block_scale_moe(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_bf16_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -1940,7 +1940,7 @@ def trtllm_bf16_moe(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_fp8_per_tensor_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2014,7 +2014,7 @@ def trtllm_fp8_per_tensor_scale_moe(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_fp8_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2092,7 +2092,7 @@ def trtllm_fp8_block_scale_moe(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_fp4_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2222,7 +2222,7 @@ def trtllm_fp4_block_scale_moe(
     )


-@flashinfer_api_log
+@flashinfer_log
 def trtllm_fp4_block_scale_routed_moe(
     topk_ids: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
