Commit 41ad558

Rename decorator and environment flags
1 parent 8459eb1 commit 41ad558

12 files changed: +91 additions, −93 deletions

LOGGING.md

Lines changed: 8 additions & 8 deletions

````diff
@@ -8,10 +8,10 @@ Enable logging using two environment variables:
 
 ```bash
 # Set logging level (0-5)
-export FLASHINFER_LOGLEVEL_DBG=3
+export FLASHINFER_LOGLEVEL=3
 
 # Set log destination (default is stdout)
-export FLASHINFER_LOGDEST_DBG=stdout  # or stderr, or a file path like "flashinfer.log"
+export FLASHINFER_LOGDEST=stdout  # or stderr, or a file path like "flashinfer.log"
 
 # Run your code
 python train.py
@@ -33,19 +33,19 @@ python train.py
 
 | Variable | Type | Default | Description |
 |----------|------|---------|-------------|
-| `FLASHINFER_LOGLEVEL_DBG` | int | 0 | Logging level (0, 1, 3, 5) |
-| `FLASHINFER_LOGDEST_DBG` | str | `stdout` | Log destination: `stdout`, `stderr`, or file path |
+| `FLASHINFER_LOGLEVEL` | int | 0 | Logging level (0, 1, 3, 5) |
+| `FLASHINFER_LOGDEST` | str | `stdout` | Log destination: `stdout`, `stderr`, or file path |
 
 ### Process ID Substitution
 
 Use `%i` in file paths for automatic process ID substitution (useful for multi-GPU training):
 
 ```bash
-export FLASHINFER_LOGDEST_DBG="flashinfer_log_%i.txt"  # → flashinfer_log_12345.txt
+export FLASHINFER_LOGDEST="flashinfer_log_%i.txt"  # → flashinfer_log_12345.txt
 ```
 
 This works for:
-- `FLASHINFER_LOGDEST_DBG`
+- `FLASHINFER_LOGDEST`
 
 ## Miscellaneous Notes and Examples
 ### CUDA Graph Compatibility
@@ -65,8 +65,8 @@ Output shows: `[statistics skipped: CUDA graph capture in progress]`
 
 ```bash
 # Use %i for process ID substitution
-export FLASHINFER_LOGLEVEL_DBG=3
-export FLASHINFER_LOGDEST_DBG="logs/flashinfer_api_%i.log"
+export FLASHINFER_LOGLEVEL=3
+export FLASHINFER_LOGDEST="logs/flashinfer_api_%i.log"
 
 torchrun --nproc_per_node=8 awesome_script_that_uses_FlashInfer.py
 
````
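The `%i` substitution documented above is straightforward: the destination path is rewritten once with the current process ID, so each rank of a multi-GPU job writes to its own log file. A minimal standalone sketch (modeled on the `_substitute_process_id` helper visible in `flashinfer/api_logging.py` later in this commit, not the library's exact code):

```python
import os

def substitute_process_id(path: str) -> str:
    """Replace the %i placeholder with the current process ID (sketch)."""
    return path.replace("%i", str(os.getpid()))

# Each process resolves to a distinct file, so concurrent ranks
# never interleave writes in a single log.
log_dest = substitute_process_id("flashinfer_log_%i.txt")
print(log_dest)  # e.g. flashinfer_log_12345.txt when the PID is 12345
```

Paths without the placeholder pass through unchanged, which is why the same variable works for single-process runs.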

README.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -175,10 +175,10 @@ FlashInfer provides comprehensive API logging for debugging. Enable it using env
 
 ```bash
 # Enable logging (levels: 0=off (default), 1=basic, 3=detailed, 5=statistics)
-export FLASHINFER_LOGLEVEL_DBG=3
+export FLASHINFER_LOGLEVEL=3
 
 # Set log destination (stdout (default), stderr, or file path)
-export FLASHINFER_LOGDEST_DBG=stdout
+export FLASHINFER_LOGDEST=stdout
 ```
 
 For detailed information about logging levels, configuration, and advanced features, see [LOGGING.md](LOGGING.md).
````

benchmarks/bench_logging_overhead.py

Lines changed: 12 additions & 12 deletions

```diff
@@ -7,14 +7,14 @@
 
 Usage:
     # Set the logging level before running
-    export FLASHINFER_LOGLEVEL_DBG=3
+    export FLASHINFER_APILEVEL=3
     python bench_logging_overhead.py
 
     # Or run with different levels
-    FLASHINFER_LOGLEVEL_DBG=0 python bench_logging_overhead.py
-    FLASHINFER_LOGLEVEL_DBG=1 python bench_logging_overhead.py
-    FLASHINFER_LOGLEVEL_DBG=3 python bench_logging_overhead.py
-    FLASHINFER_LOGLEVEL_DBG=5 python bench_logging_overhead.py
+    FLASHINFER_APILEVEL=0 python bench_logging_overhead.py
+    FLASHINFER_APILEVEL=1 python bench_logging_overhead.py
+    FLASHINFER_APILEVEL=3 python bench_logging_overhead.py
+    FLASHINFER_APILEVEL=5 python bench_logging_overhead.py
 
     # Or use the helper script to run all levels
     bash benchmark_all_levels.sh
@@ -28,11 +28,11 @@
 from typing import List, Tuple
 
 # Get logging level BEFORE importing flashinfer
-LOGGING_LEVEL = int(os.environ.get("FLASHINFER_LOGLEVEL_DBG", "0"))
-LOG_DEST = os.environ.get("FLASHINFER_LOGDEST_DBG", "/tmp/flashinfer_benchmark_log.txt")
+LOGGING_LEVEL = int(os.environ.get("FLASHINFER_APILEVEL", "0"))
+LOG_DEST = os.environ.get("FLASHINFER_APIDEST", "/tmp/flashinfer_benchmark_log.txt")
 
 # Import the decorator
-from flashinfer.api_logging import flashinfer_log
+from flashinfer.api_logging import flashinfer_api
 
 
 # Create two versions of a test function:
@@ -42,7 +42,7 @@ def test_matmul_undecorated(A, B):
     return torch.matmul(A, B)
 
 
-@flashinfer_log
+@flashinfer_api
 def test_matmul_decorated(A, B):
     return torch.matmul(A, B)
 
@@ -209,8 +209,8 @@ def main():
 
     # Display logging configuration
     print("\nLogging Configuration:")
-    print(f"  FLASHINFER_LOGLEVEL_DBG = {LOGGING_LEVEL}")
-    print(f"  FLASHINFER_LOGDEST_DBG = {LOG_DEST}")
+    print(f"  FLASHINFER_APILEVEL = {LOGGING_LEVEL}")
+    print(f"  FLASHINFER_APIDEST = {LOG_DEST}")
 
     # Get level name
     level_names = {
@@ -314,7 +314,7 @@ def main():
     print("\nTo benchmark other levels, run:")
     for level in [0, 1, 3, 5]:
         if level != LOGGING_LEVEL:
-            print(f"  FLASHINFER_LOGLEVEL_DBG={level} python {sys.argv[0]}")
+            print(f"  FLASHINFER_APILEVEL={level} python {sys.argv[0]}")
 
     print("\n" + "=" * 80)
     print("Benchmark complete!")
```
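The benchmark's core idea — timing a decorated function against an identical undecorated twin — can be sketched without FlashInfer at all. Here `noop_decorator` is a stand-in for a logging decorator disabled at level 0, not the real `flashinfer_api`:

```python
import time

def noop_decorator(func):
    # Stand-in for a disabled logging decorator: returning func
    # unchanged means calls pay no wrapper cost at all.
    return func

def square_plain(x):
    return x * x

@noop_decorator
def square_decorated(x):
    return x * x

def bench(fn, n=50_000):
    # Wall-clock time for n calls; the delta between the two
    # functions approximates per-call decorator overhead.
    start = time.perf_counter()
    for i in range(n):
        fn(i)
    return time.perf_counter() - start

t_plain, t_decorated = bench(square_plain), bench(square_decorated)
print(f"plain: {t_plain:.4f}s  decorated: {t_decorated:.4f}s")
```

Because the disabled decorator hands back the original function object, the two timings should be statistically indistinguishable — which is the zero-overhead property the benchmark checks at level 0.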

flashinfer/api_logging.py

Lines changed: 9 additions & 11 deletions

```diff
@@ -39,10 +39,8 @@ def _substitute_process_id(path: str) -> str:
 
 
 # Read environment variables once at module load time
-_API_LOG_LEVEL = int(os.environ.get("FLASHINFER_LOGLEVEL_DBG", "0"))
-_API_LOG_DEST = _substitute_process_id(
-    os.environ.get("FLASHINFER_LOGDEST_DBG", "stdout")
-)
+_API_LOG_LEVEL = int(os.environ.get("FLASHINFER_APILEVEL", "0"))
+_API_LOG_DEST = _substitute_process_id(os.environ.get("FLASHINFER_APIDEST", "stdout"))
 
 # Create logger using Python's logging library
 _logger = logging.getLogger("flashinfer.api")
@@ -56,7 +54,7 @@ def _setup_logger():
         _logger.setLevel(logging.CRITICAL + 1)  # Higher than any level
         return
 
-    # All enabled levels use loggging.DEBUG; verbosity is controlled by FLASHINFER_LOGLEVEL_DBG instead
+    # All enabled levels use logging.DEBUG; verbosity is controlled by FLASHINFER_APILEVEL instead
     _logger.setLevel(logging.DEBUG)
 
     # Remove any existing handlers
@@ -463,22 +461,22 @@ def _log_function_outputs(func_name: str, result: Any, level: int) -> None:
     _logger.debug("\n".join(lines))
 
 
-def flashinfer_log(func: Callable = None) -> Callable:
+def flashinfer_api(func: Callable = None) -> Callable:
     """
     Decorator to log FlashInfer API calls using Python's logging library.
 
     This decorator integrates with Python's standard logging infrastructure while
-    maintaining zero overhead when disabled (FLASHINFER_LOGLEVEL_DBG=0).
+    maintaining zero overhead when disabled (FLASHINFER_APILEVEL=0).
 
     Environment Variables
     ---------------------
-    FLASHINFER_LOGLEVEL_DBG : int (default: 0)
+    FLASHINFER_APILEVEL : int (default: 0)
         - 0: No logging (zero overhead - decorator returns original function)
         - 1: Log function name only (logged BEFORE execution - crash-safe)
         - 3: Log function name + inputs/outputs with metadata (inputs logged BEFORE execution - crash-safe)
         - 5: Log function name + inputs/outputs with metadata + tensor statistics (inputs logged BEFORE execution - crash-safe)
 
-    FLASHINFER_LOGDEST_DBG : str (default: "stdout")
+    FLASHINFER_APIDEST : str (default: "stdout")
         - "stdout": Log to standard output
         - "stderr": Log to standard error
         - <path>: Log to specified file path
@@ -488,15 +486,15 @@ def flashinfer_log(func: Callable = None) -> Callable:
     --------
     Basic usage:
 
-    >>> @flashinfer_log
+    >>> @flashinfer_api
     ... def my_function(x, y):
     ...     return x + y
 
     Notes
     -----
     - Key header lines include a timestamp in the format: [YYYY-MM-DD HH:MM:SS]
       (e.g., "FlashInfer API Call: function_name", "FlashInfer API Logging - System Information")
-    - When FLASHINFER_LOGLEVEL_DBG=0, the decorator has truly zero overhead
+    - When FLASHINFER_APILEVEL=0, the decorator has truly zero overhead
       as it returns the original function unchanged.
     - Function names and inputs are logged BEFORE execution:
       - Level 1: Function name only
```
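The docstring's zero-overhead claim rests on a standard pattern: when the level is 0, the decorator hands back the original function object, so there is no wrapper frame on any call. A simplified sketch — the env var name follows this commit, but the body is an illustration, not the actual `flashinfer_api` implementation:

```python
import functools
import os

# Read once at import time, mirroring the module-load read in the diff.
_LEVEL = int(os.environ.get("FLASHINFER_APILEVEL", "0"))

def api_log(func):
    if _LEVEL == 0:
        # Disabled: return the original function unchanged, so
        # decorated calls cost exactly the same as undecorated ones.
        return func

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Log the name BEFORE execution, so a crash inside func
        # still leaves a record of which API was entered.
        print(f"FlashInfer API Call: {func.__name__}")
        return func(*args, **kwargs)

    return wrapper

@api_log
def add(x, y):
    return x + y
```

`functools.wraps` keeps the wrapped function's name and docstring intact, so logging output and introspection stay readable at levels above 0.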

flashinfer/cudnn/decode.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -3,7 +3,7 @@
 
 import torch
 
-from ..api_logging import flashinfer_log
+from ..api_logging import flashinfer_api
 from .utils import get_cudnn_fmha_gen_module
 
 try:
@@ -253,7 +253,7 @@ def _batch_decode_with_kv_cache(
     return out
 
 
-@flashinfer_log
+@flashinfer_api
 def cudnn_batch_decode_with_kv_cache(
     q: torch.Tensor,
     k_cache: torch.Tensor,
```

flashinfer/cudnn/prefill.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -3,7 +3,7 @@
 
 import torch
 
-from ..api_logging import flashinfer_log
+from ..api_logging import flashinfer_api
 from .utils import get_cudnn_fmha_gen_module
 
 try:
@@ -384,7 +384,7 @@ def _batch_prefill_with_kv_cache(
     return out, None
 
 
-@flashinfer_log
+@flashinfer_api
 def cudnn_batch_prefill_with_kv_cache(
     q: torch.Tensor,
     k_cache: torch.Tensor,
```

flashinfer/decode.py

Lines changed: 10 additions & 10 deletions

```diff
@@ -21,7 +21,7 @@
 
 import torch
 
-from .api_logging import flashinfer_log
+from .api_logging import flashinfer_api
 from .xqa import xqa, xqa_mla
 from .cudnn import cudnn_batch_decode_with_kv_cache as cudnn_batch_decode_with_kv_cache
 from .jit import (
@@ -313,7 +313,7 @@ def get_trtllm_gen_fmha_module():
     return op
 
 
-@flashinfer_log
+@flashinfer_api
 def single_decode_with_kv_cache_with_jit_module(
     jit_module: Any,
     q: torch.Tensor,
@@ -390,7 +390,7 @@ def single_decode_with_kv_cache(
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
 
-@flashinfer_log
+@flashinfer_api
 def single_decode_with_kv_cache(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -649,7 +649,7 @@ class BatchDecodeWithPagedKVCacheWrapper:
     manages the lifecycle of these data structures.
     """
 
-    @flashinfer_log
+    @flashinfer_api
     def __init__(
         self,
         float_workspace_buffer: torch.Tensor,
@@ -813,7 +813,7 @@ def reset_workspace_buffer(
             pin_memory=True,
         )
 
-    @flashinfer_log
+    @flashinfer_api
     def plan(
         self,
         indptr: torch.Tensor,
@@ -1167,7 +1167,7 @@ def run(
         window_left: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
-    @flashinfer_log
+    @flashinfer_api
     def run(
         self,
         q: torch.Tensor,
@@ -2065,7 +2065,7 @@ def _fake_paged_run(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_batch_decode_with_kv_cache(
     query: torch.Tensor,
     kv_cache: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
@@ -2339,7 +2339,7 @@ def trtllm_batch_decode_with_kv_cache(
 
 
 # xqa uses NHD layout
-@flashinfer_log
+@flashinfer_api
 def xqa_batch_decode_with_kv_cache(
     query: torch.Tensor,
     kv_cache: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
@@ -2524,7 +2524,7 @@ def _check_trtllm_gen_mla_shape(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_batch_decode_with_kv_cache_mla(
     query: torch.Tensor,
     kv_cache: torch.Tensor,
@@ -2686,7 +2686,7 @@ def trtllm_batch_decode_with_kv_cache_mla(
     raise ValueError(f"Backend {backend} not supported")
 
 
-@flashinfer_log
+@flashinfer_api
 def xqa_batch_decode_with_kv_cache_mla(
     query: torch.Tensor,
     kv_cache: torch.Tensor,
```

flashinfer/fused_moe/core.py

Lines changed: 7 additions & 7 deletions

```diff
@@ -20,7 +20,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 import torch
 
-from ..api_logging import flashinfer_log
+from ..api_logging import flashinfer_api
 from ..autotuner import (
     AutoTuner,
     DynamicTensorSpec,
@@ -686,7 +686,7 @@ def _fake_cutlass_fused_moe(
 
 
 # ref: https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py#L121
-@flashinfer_log
+@flashinfer_api
 def cutlass_fused_moe(
     input: torch.Tensor,
     token_selected_experts: torch.Tensor,
@@ -1859,7 +1859,7 @@ def _fake_trtllm_fp4_block_scale_moe(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_bf16_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -1940,7 +1940,7 @@ def trtllm_bf16_moe(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_fp8_per_tensor_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2014,7 +2014,7 @@ def trtllm_fp8_per_tensor_scale_moe(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_fp8_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2092,7 +2092,7 @@ def trtllm_fp8_block_scale_moe(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_fp4_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2222,7 +2222,7 @@ def trtllm_fp4_block_scale_moe(
     )
 
 
-@flashinfer_log
+@flashinfer_api
 def trtllm_fp4_block_scale_routed_moe(
     topk_ids: torch.Tensor,
    routing_bias: Optional[torch.Tensor],
```
