11import logging
22import os
33import socket
4+ import time
45from contextlib import contextmanager
56from typing import Optional
67
78import psutil
89
9- # import torch.distributed as dist
10-
1110
1211@contextmanager
1312def temp_env (env_dict ):
@@ -25,33 +24,35 @@ def temp_env(env_dict):
2524 os .environ [k ] = v
2625
2726
28- # @contextmanager
29- # def dist_group(backend="gloo"):
30- # if dist.is_initialized():
31- # yield
32- # return
def dns_resolver(retries: int = 20, base_delay: float = 0.5):
    """
    Build a hostname resolver that retries on transient DNS failures.

    DNS resolution may fail right after a pod starts, so the returned
    callable retries ``socket.gethostbyname`` with a growing delay instead
    of failing on the first error.

    Args:
        retries: Total number of resolution attempts. Values below 1 are
            treated as 1 so the resolver always tries at least once.
        base_delay: Seconds to sleep after the first failed attempt. The
            delay grows by a factor of 1.5 per failure, capped at 5 seconds.

    Returns:
        A callable ``resolve(dns)`` that returns the IPv4 address string for
        ``dns`` and re-raises ``socket.gaierror`` once all attempts failed.
    """
    # Guard against retries <= 0: an empty loop would silently return None.
    attempts = max(1, retries)

    def resolve(dns: str) -> str:
        delay = base_delay
        for attempt in range(attempts):
            try:
                return socket.gethostbyname(dns)
            except socket.gaierror:
                if attempt == attempts - 1:
                    # Out of attempts: surface the original resolver error.
                    raise
                time.sleep(delay)
                delay = min(delay * 1.5, 5)

    return resolve
3942
4043
41- def get_cluster_ips (word_size : int = 2 ) -> list [str ]:
42- """
43- Returns the IP addresses of all nodes in the cluster.
44- 0: leader
45- 1~N-1: workers
46- """
def get_cluster_dns_list(word_size: int) -> list[str]:
    """
    Return the DNS names of all nodes in the cluster.

    Index 0 is the leader, read from the ``LWS_LEADER_ADDRESS`` environment
    variable; indices 1..word_size-1 are the workers, whose names follow the
    fixed ``vllm-0-<i>.vllm.vllm-project`` pattern.

    Args:
        word_size: Total number of nodes, leader included. With a value of
            1 or less only the leader is returned.

    Raises:
        RuntimeError: If ``LWS_LEADER_ADDRESS`` is unset or empty.
    """
    leader_dns = os.getenv("LWS_LEADER_ADDRESS")
    if not leader_dns:
        raise RuntimeError("LWS_LEADER_ADDRESS is not set")

    # Hostnames must not contain whitespace, so the pattern is kept compact.
    workers = [f"vllm-0-{i}.vllm.vllm-project" for i in range(1, word_size)]
    return [leader_dns, *workers]
51+
52+
def get_cluster_ips(word_size: int = 2) -> list[str]:
    """
    Resolve every cluster DNS name to an IP address.

    The result follows the order of ``get_cluster_dns_list``; each name is
    looked up through a resolver built by ``dns_resolver`` with its default
    settings.
    """
    lookup = dns_resolver()
    hostnames = get_cluster_dns_list(word_size)
    return list(map(lookup, hostnames))
5556
5657
5758def get_avaliable_port (start_port : int = 6000 , end_port : int = 7000 ) -> int :
@@ -66,9 +67,29 @@ def get_avaliable_port(start_port: int = 6000, end_port: int = 7000) -> int:
6667 raise RuntimeError ("No available port found" )
6768
6869
69- def get_cur_ip () -> str :
70- """Returns the current machine's IP address."""
71- return socket .gethostbyname_ex (socket .gethostname ())[2 ][0 ]
def get_cur_ip(retries: int = 20, base_delay: float = 0.5) -> str:
    """
    Return the pod/machine's primary IP address, retrying on failure.

    Retrying is necessary because network interfaces may not be ready
    immediately after container startup. Each attempt first asks the OS
    which local address it would use to reach 8.8.8.8, then falls back to
    resolving the local hostname.

    Args:
        retries: Total number of attempts. Values below 1 are treated as 1
            so the function never returns without trying.
        base_delay: Seconds to sleep after the first failed attempt. The
            delay grows by a factor of 1.5 per failure, capped at 5 seconds.

    Returns:
        The local IPv4 address as a string.

    Raises:
        RuntimeError: If both methods fail on the final attempt; the last
            underlying error is attached as the cause.
    """
    # Guard against retries <= 0: an empty loop would silently return None.
    attempts = max(1, retries)
    delay = base_delay

    for attempt in range(attempts):
        try:
            # Best method: UDP trick (doesn't actually send packets).
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(("8.8.8.8", 80))
                return s.getsockname()[0]
        except OSError:
            # No usable route yet; try the hostname fallback below.
            pass

        try:
            # Fallback: hostname resolution.
            return socket.gethostbyname(socket.gethostname())
        except (OSError, UnicodeError) as err:
            if attempt == attempts - 1:
                raise RuntimeError("Failed to determine local IP address") from err

        time.sleep(delay)
        delay = min(delay * 1.5, 5)
7293
7394
7495def get_net_interface (ip : Optional [str ] = None ) -> Optional [str ]:
0 commit comments