Skip to content

Commit c216119

Browse files
[Core] GC Debug callback (vllm-project#24829)
Signed-off-by: Jialin Ouyang <jialino@meta.com> Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com> Co-authored-by: Jialin Ouyang <jialino@meta.com>
1 parent 5546acb commit c216119

File tree

5 files changed

+217
-0
lines changed

5 files changed

+217
-0
lines changed

docs/contributing/profiling.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,11 @@ One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
208208
pip install snakeviz
209209
snakeviz expensive_function.prof
210210
```
211+
212+
### Analyzing Garbage Collection Costs
213+
214+
Leverage VLLM_GC_DEBUG environment variable to debug GC costs.
215+
216+
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
217+
- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
218+
collected objects for each gc.collect

tests/utils_/test_gc_utils.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
from dataclasses import dataclass
4+
from typing import Any
5+
6+
from vllm.utils.gc_utils import (GCDebugConfig, _compute_detailed_type,
7+
_compute_top_gc_collected_objects)
8+
9+
10+
@dataclass
11+
class Normal:
12+
v: int
13+
14+
15+
@dataclass
16+
class ListWrapper:
17+
vs: list[int]
18+
19+
def __len__(self) -> int:
20+
return len(self.vs)
21+
22+
23+
def test_compute_detailed_type():
24+
assert _compute_detailed_type(
25+
Normal(v=8)) == "<class 'tests.utils_.test_gc_utils.Normal'>"
26+
27+
assert _compute_detailed_type([1, 2, 3]) == "<class 'list'>(size:3)"
28+
assert _compute_detailed_type({4, 5}) == "<class 'set'>(size:2)"
29+
assert _compute_detailed_type({6: 7}) == "<class 'dict'>(size:1)"
30+
assert _compute_detailed_type(ListWrapper(
31+
vs=[])) == "<class 'tests.utils_.test_gc_utils.ListWrapper'>(size:0)"
32+
33+
34+
def test_compute_top_gc_collected_objects():
35+
objects: list[Any] = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12],
36+
{13, 14}, {
37+
15: 16,
38+
17: 18
39+
},
40+
Normal(v=19),
41+
Normal(v=20),
42+
Normal(v=21)]
43+
assert _compute_top_gc_collected_objects(objects, top=-1) == ""
44+
assert _compute_top_gc_collected_objects(objects, top=0) == ""
45+
assert _compute_top_gc_collected_objects(
46+
objects, top=1) == " 4:<class 'list'>(size:3)"
47+
assert _compute_top_gc_collected_objects(objects, top=2) == "\n".join([
48+
" 4:<class 'list'>(size:3)",
49+
" 3:<class 'tests.utils_.test_gc_utils.Normal'>"
50+
])
51+
assert _compute_top_gc_collected_objects(objects, top=3) == "\n".join([
52+
" 4:<class 'list'>(size:3)",
53+
" 3:<class 'tests.utils_.test_gc_utils.Normal'>",
54+
" 1:<class 'set'>(size:2)"
55+
])
56+
57+
58+
def test_gc_debug_config():
59+
assert not GCDebugConfig(None).enabled
60+
assert not GCDebugConfig("").enabled
61+
assert not GCDebugConfig("0").enabled
62+
63+
config = GCDebugConfig("1")
64+
assert config.enabled
65+
assert config.top_objects == -1
66+
67+
config = GCDebugConfig("{\"top_objects\":5}")
68+
assert config.enabled
69+
assert config.top_objects == 5

vllm/envs.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@
205205
VLLM_USE_NCCL_SYMM_MEM: bool = False
206206
VLLM_NCCL_INCLUDE_PATH: Optional[str] = None
207207
VLLM_USE_FBGEMM: bool = False
208+
VLLM_GC_DEBUG: str = ""
208209

209210

210211
def get_default_cache_root():
@@ -1475,6 +1476,13 @@ def get_vllm_port() -> Optional[int]:
14751476
lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None),
14761477
# Flag to enable FBGemm kernels on model execution
14771478
"VLLM_USE_FBGEMM": lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))),
1479+
1480+
# GC debug config
1481+
# - VLLM_GC_DEBUG=0: disable GC debugger
1482+
# - VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
1483+
# - VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger with
1484+
# top 5 collected objects
1485+
"VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""),
14781486
}
14791487

14801488
# --8<-- [end:env-vars-definition]

vllm/utils/gc_utils.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
import gc
4+
import json
5+
import time
6+
from collections import Counter
7+
from contextlib import suppress
8+
from typing import Any, Optional
9+
10+
from vllm.envs import VLLM_GC_DEBUG
11+
from vllm.logger import init_logger
12+
13+
logger = init_logger(__name__)
14+
15+
16+
class GCDebugConfig:
17+
"""
18+
Config for GC Debugger.
19+
- 0: disable GC debugger
20+
- 1: enable GC debugger with gc.collect elpased times
21+
- '{"top_objects":5}': enable GC debugger with top 5 collected objects
22+
"""
23+
24+
def __init__(self, gc_debug_conf: Optional[str] = None) -> None:
25+
self.enabled: bool = False
26+
self.top_objects: int = -1
27+
28+
if not gc_debug_conf or gc_debug_conf == "0":
29+
pass
30+
elif gc_debug_conf == "1":
31+
self.enabled = True
32+
else:
33+
try:
34+
json_conf = json.loads(gc_debug_conf)
35+
self.enabled = True
36+
self.top_objects = json_conf.get("top_objects", -1)
37+
except Exception:
38+
self.enabled = False
39+
logger.error("Failed to parse VLLM_GC_DEBUG(%s)",
40+
VLLM_GC_DEBUG)
41+
logger.info("GC Debug Config. %s", str(self))
42+
43+
def __repr__(self) -> str:
44+
return f"enabled:{self.enabled},top_objects:{self.top_objects}"
45+
46+
47+
class GCDebugger:
48+
"""
49+
Debugger for GC which logs helpful information for GC understanding.
50+
To enable, you should call maybe_attach_gc_debug_callback in the process.
51+
"""
52+
53+
def __init__(self, config: GCDebugConfig) -> None:
54+
self.config = config
55+
# Start time in micro second of this GC cycle
56+
self.start_time_ns: int = time.monotonic_ns()
57+
# If config.top_objects is positive,
58+
# compute top collected objects by object types
59+
self.gc_top_collected_objects: str = ""
60+
61+
def handle(self, phase: str, info: dict[str, int]) -> None:
62+
"""
63+
Handles a GC event (e.g. GC start or GC finish)
64+
"""
65+
generation = info.get("generation")
66+
if generation is None:
67+
return
68+
if phase == "start":
69+
# Before GC started, record GC start time
70+
# and top collected objects
71+
self.start_time_ns = time.monotonic_ns()
72+
self.gc_top_collected_objects = _compute_top_gc_collected_objects(
73+
gc.get_objects(generation), self.config.top_objects)
74+
elif phase == "stop":
75+
# After GC finished, Record GC elapsed time and
76+
# optionally top collected objects
77+
elpased_ms = (time.monotonic_ns() - self.start_time_ns) / 1e6
78+
logger.info(
79+
"GC took %.3fms to complete. "
80+
"Collected %s objects in GC generation %d.%s",
81+
elpased_ms,
82+
str(info.get("collected", "?")),
83+
generation,
84+
(f" Top collected objects: \n{self.gc_top_collected_objects}"
85+
if self.gc_top_collected_objects else ""),
86+
)
87+
88+
89+
def maybe_attach_gc_debug_callback() -> None:
90+
"""
91+
Attached a callback for GC debug when VLLM_GC_DEBUG is enabled.
92+
"""
93+
config = GCDebugConfig(VLLM_GC_DEBUG)
94+
if config.enabled:
95+
debugger: GCDebugger = GCDebugger(config)
96+
97+
def gc_callback(phase: str, info: dict[str, int]) -> None:
98+
debugger.handle(phase, info)
99+
100+
gc.callbacks.append(gc_callback)
101+
102+
103+
def _compute_detailed_type(o: Any) -> str:
104+
"""
105+
Detailed object type.
106+
107+
TODO(Jialin): Further enhance the detailed type with element types for
108+
easier debugging. We tried but occasionally it would run into signals
109+
which kills the engine.
110+
"""
111+
size_str: str = ""
112+
# Object doesn't support len() - this can happen with type objects
113+
# or other objects that don't implement __len__ properly
114+
with suppress(Exception):
115+
size_str = f"(size:{len(o)})"
116+
return f"{str(type(o))}{size_str}"
117+
118+
119+
def _compute_top_gc_collected_objects(objects: list[Any], top: int) -> str:
120+
"""
121+
Group collected objects by types.
122+
"""
123+
if top <= 0:
124+
return ""
125+
object_types = [_compute_detailed_type(o) for o in objects]
126+
return "\n".join(
127+
f"{count:>5}:{object_type}"
128+
for object_type, count in Counter(object_types).most_common(top))

vllm/v1/engine/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
maybe_register_config_serialize_by_value)
3030
from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket,
3131
resolve_obj_by_qualname, set_process_title)
32+
from vllm.utils.gc_utils import maybe_attach_gc_debug_callback
3233
from vllm.v1.core.kv_cache_utils import (BlockHash,
3334
generate_scheduler_kv_cache_config,
3435
get_kv_cache_configs,
@@ -532,6 +533,9 @@ def __init__(
532533
gc.collect()
533534
gc.freeze()
534535

536+
# If enable, attach GC debugger after static variable freeze.
537+
maybe_attach_gc_debug_callback()
538+
535539
@contextmanager
536540
def _perform_handshakes(
537541
self,

0 commit comments

Comments
 (0)