66import shutil
77import subprocess
88import sys
9+ from datetime import datetime
910
1011import yaml
1112
@@ -22,6 +23,10 @@ def parse_args():
2223 '--dir' ,
2324 type = str ,
2425 help = 'Directory containing YAML configuration files' )
26+ group .add_argument ('--log-dir' ,
27+ type = str ,
28+ default = None ,
29+ help = 'Log directory' )
2530 return parser .parse_args ()
2631
2732
@@ -45,7 +50,7 @@ def calculate_nodes(world_size, num_servers, gpus_per_node):
4550 return (world_size + gpus_per_node - 1 ) // gpus_per_node * num_servers
4651
4752
48- def submit_job (config ):
53+ def submit_job (config , log_dir ):
4954 # Extract configurations
5055 slurm_config = config ['slurm' ]
5156 hw_config = config ['hardware' ]
@@ -101,25 +106,28 @@ def submit_job(config):
101106 gen_enable_attention_dp = config ['worker_config' ]['gen' ][
102107 'enable_attention_dp' ]
103108
104- # Create base log directory path
105- log_base = os .path .join (env_config ['work_dir' ], f"{ isl } -{ osl } " )
109+ if log_dir is None :
110+ # Create base log directory path
111+ date_prefix = datetime .now ().strftime ("%Y%m%d" )
112+ log_base = os .path .join (env_config ['work_dir' ], f"{ date_prefix } /{ isl } -{ osl } " )
106113
107- # Get eplb num_slots for gen worker
108- load_balancer_config = config ['worker_config' ]['gen' ].get (
109- 'moe_config' , {}).get ('load_balancer' , {})
110- if isinstance (load_balancer_config , str ):
111- with open (load_balancer_config , 'r' ) as f :
112- load_balancer_config = yaml .safe_load (f )
113- eplb_num_slots = load_balancer_config .get ('num_slots' , 0 )
114+ # Get eplb num_slots for gen worker
115+ load_balancer_config = config ['worker_config' ]['gen' ].get (
116+ 'moe_config' , {}).get ('load_balancer' , {})
117+ if isinstance (load_balancer_config , str ):
118+ with open (load_balancer_config , 'r' ) as f :
119+ load_balancer_config = yaml .safe_load (f )
120+ eplb_num_slots = load_balancer_config .get ('num_slots' , 0 )
114121
115- # Determine directory suffix based on attention_dp
116- if gen_enable_attention_dp :
117- dir_suffix = f"ctx{ ctx_num } _gen{ gen_num } _dep{ gen_tp_size } _batch{ gen_batch_size } _eplb{ eplb_num_slots } _mtp{ mtp_size } "
118- else :
119- dir_suffix = f"ctx{ ctx_num } _gen{ gen_num } _tep{ gen_tp_size } _batch{ gen_batch_size } _eplb{ eplb_num_slots } _mtp{ mtp_size } "
122+ # Determine directory suffix based on attention_dp
123+ if gen_enable_attention_dp :
124+ dir_suffix = f"ctx{ ctx_num } _gen{ gen_num } _dep{ gen_tp_size } _batch{ gen_batch_size } _eplb{ eplb_num_slots } _mtp{ mtp_size } "
125+ else :
126+ dir_suffix = f"ctx{ ctx_num } _gen{ gen_num } _tep{ gen_tp_size } _batch{ gen_batch_size } _eplb{ eplb_num_slots } _mtp{ mtp_size } "
127+
128+ # Create full log directory path
129+ log_dir = os .path .join (log_base , dir_suffix )
120130
121- # Create full log directory path
122- log_dir = os .path .join (log_base , dir_suffix )
123131 # Remove existing directory if it exists
124132 if os .path .exists (log_dir ):
125133 shutil .rmtree (log_dir )
@@ -231,7 +239,7 @@ def main():
231239 print (f"\n Processing: { config_file } " )
232240 try :
233241 config = load_config (config_file )
234- submit_job (config )
242+ submit_job (config , args . log_dir )
235243 print (f"Successfully submitted job for: { config_file } " )
236244 except Exception as e :
237245 print (f"Error processing { config_file } : { e } " , file = sys .stderr )
0 commit comments