Skip to content

Commit 91b6ba8

Browse files
authored
[CI] Fix Kubernetes failure to resolve IP by DNS name (#4240)
### What this PR does / why we need it? A pod may already be started while its corresponding DNS service is not yet ready. If the DNS domain name is resolved immediately at that point, an error occurs. See https://github.com/vllm-project/vllm-ascend/actions/runs/19436639688/job/55609108796 - vLLM version: v0.11.0 - vLLM main: vllm-project/vllm@2918c1b --------- Signed-off-by: wangli <wangli858794774@gmail.com>
1 parent df777e9 commit 91b6ba8

File tree

1 file changed

+47
-26
lines changed
  • tests/e2e/nightly/multi_node/config

1 file changed

+47
-26
lines changed

tests/e2e/nightly/multi_node/config/utils.py

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
import logging
22
import os
33
import socket
4+
import time
45
from contextlib import contextmanager
56
from typing import Optional
67

78
import psutil
89

9-
# import torch.distributed as dist
10-
1110

1211
@contextmanager
1312
def temp_env(env_dict):
@@ -25,33 +24,35 @@ def temp_env(env_dict):
2524
os.environ[k] = v
2625

2726

28-
# @contextmanager
29-
# def dist_group(backend="gloo"):
30-
# if dist.is_initialized():
31-
# yield
32-
# return
27+
def dns_resolver(retries: int = 20, base_delay: float = 0.5):
    """Build a hostname resolver that retries failed DNS lookups.

    Right after a pod starts, DNS resolution may fail transiently, so a
    single immediate lookup is not reliable. The returned resolver retries
    with a growing delay instead.

    Args:
        retries: Maximum number of lookup attempts. Values below 1 are
            treated as 1, so the resolver always tries at least once rather
            than silently returning ``None``.
        base_delay: Seconds to sleep after the first failed attempt. The
            delay is multiplied by 1.5 after every failure and capped at
            5 seconds.

    Returns:
        A function ``resolve(dns)`` that maps a DNS name to the IPv4 address
        string returned by ``socket.gethostbyname``.
    """
    attempts = max(1, retries)

    def resolve(dns: str) -> str:
        """Resolve ``dns`` to an IPv4 address string, retrying on failure.

        Raises:
            socket.gaierror: If the final attempt also fails.
        """
        delay = base_delay
        # Every attempt except the last one swallows the lookup error,
        # waits, and tries again.
        for _ in range(attempts - 1):
            try:
                return socket.gethostbyname(dns)
            except socket.gaierror:
                time.sleep(delay)
                delay = min(delay * 1.5, 5)
        # Final attempt: let socket.gaierror propagate to the caller.
        return socket.gethostbyname(dns)

    return resolve
3942

4043

41-
def get_cluster_ips(word_size: int = 2) -> list[str]:
42-
"""
43-
Returns the IP addresses of all nodes in the cluster.
44-
0: leader
45-
1~N-1: workers
46-
"""
44+
def get_cluster_dns_list(word_size: int) -> list[str]:
    """Return the DNS names of all cluster nodes, leader first.

    The leader's name is read from the ``LWS_LEADER_ADDRESS`` environment
    variable. Worker names follow the fixed pattern
    ``vllm-0-{i}.vllm.vllm-project`` for ``i`` from 1 to ``word_size - 1``.

    Args:
        word_size: Total number of nodes, leader included.

    Returns:
        A list whose first entry is the leader's DNS name, followed by one
        entry per worker.

    Raises:
        RuntimeError: If ``LWS_LEADER_ADDRESS`` is unset or empty.
    """
    leader_dns = os.environ.get("LWS_LEADER_ADDRESS")
    if not leader_dns:
        raise RuntimeError("LWS_LEADER_ADDRESS is not set")

    dns_names = [leader_dns]
    dns_names.extend(
        f"vllm-0-{rank}.vllm.vllm-project" for rank in range(1, word_size)
    )
    return dns_names
51+
52+
53+
def get_cluster_ips(word_size: int = 2) -> list[str]:
    """Return the IP addresses of all cluster nodes.

    Each name from ``get_cluster_dns_list(word_size)`` is resolved, in
    order, using a resolver built by ``dns_resolver()`` with its default
    settings.
    """
    resolve = dns_resolver()
    cluster_dns = get_cluster_dns_list(word_size)
    return list(map(resolve, cluster_dns))
5556

5657

5758
def get_avaliable_port(start_port: int = 6000, end_port: int = 7000) -> int:
@@ -66,9 +67,29 @@ def get_avaliable_port(start_port: int = 6000, end_port: int = 7000) -> int:
6667
raise RuntimeError("No available port found")
6768

6869

69-
def get_cur_ip() -> str:
70-
"""Returns the current machine's IP address."""
71-
return socket.gethostbyname_ex(socket.gethostname())[2][0]
70+
def get_cur_ip(retries: int = 20, base_delay: float = 0.5) -> str:
    """Return the pod/machine's primary IPv4 address, with retries.

    Network interfaces may not be ready immediately after container
    startup, so detection is retried with a growing delay.

    Each attempt first connects a UDP socket to a public address and reads
    back the local address chosen for it (connecting a datagram socket
    does not actually send packets). If that fails, it falls back to
    resolving the local hostname.

    Args:
        retries: Maximum number of detection attempts. Values below 1 are
            treated as 1, so the function never falls through and returns
            ``None``.
        base_delay: Seconds to sleep after the first failed attempt. The
            delay is multiplied by 1.5 after every failure and capped at
            5 seconds.

    Returns:
        The detected IPv4 address as a string.

    Raises:
        RuntimeError: If both methods fail on every attempt. The last
            underlying ``OSError`` is attached as the cause.
    """
    attempts = max(1, retries)
    delay = base_delay
    last_error = None

    for attempt in range(attempts):
        try:
            # Preferred method: ask which local address would be used for
            # outbound traffic.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 80))
                return s.getsockname()[0]
        except OSError:
            # Deliberately ignored: the hostname fallback below covers it.
            pass

        try:
            # Fallback: hostname resolution.
            return socket.gethostbyname(socket.gethostname())
        except OSError as err:
            last_error = err

        # Do not sleep after the final failed attempt.
        if attempt < attempts - 1:
            time.sleep(delay)
            delay = min(delay * 1.5, 5)

    raise RuntimeError("Failed to determine local IP address") from last_error
7293

7394

7495
def get_net_interface(ip: Optional[str] = None) -> Optional[str]:

0 commit comments

Comments
 (0)