@@ -343,8 +343,9 @@ static void spawn_timeout_cb(int fd, short event, void *cbdata)
343343 }
344344}
345345
346- static void stack_trace_recv (int status , pmix_proc_t * sender , pmix_data_buffer_t * buffer ,
347- prte_rml_tag_t tag , void * cbdata )
346+ void prte_plm_base_stack_trace_recv (int status , pmix_proc_t * sender ,
347+ pmix_data_buffer_t * buffer ,
348+ prte_rml_tag_t tag , void * cbdata )
348349{
349350 pmix_byte_object_t pbo ;
350351 pmix_data_buffer_t blob ;
@@ -494,13 +495,101 @@ static void stack_trace_timeout(int sd, short args, void *cbdata)
494495 PMIX_DESTRUCT (& parray );
495496}
496497
498+ static void dump_job (prte_job_t * jdata )
499+ {
500+ pmix_proc_t pc ;
501+ prte_proc_t * proc ;
502+ pmix_byte_object_t bo ;
503+ char * st ;
504+ int i ;
505+
506+ PMIX_LOAD_PROCID (& pc , jdata -> nspace , PMIX_RANK_WILDCARD );
507+ pmix_asprintf (& st , "DATA FOR JOB: %s\n" , PRTE_JOBID_PRINT (jdata -> nspace ));
508+ bo .bytes = st ;
509+ bo .size = strlen (st );
510+ PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
511+ free (st );
512+ pmix_asprintf (& st , "\tNum apps: %d\tNum procs: %d\tJobState: %s\tAbort: %s\n" ,
513+ (int ) jdata -> num_apps , (int ) jdata -> num_procs , prte_job_state_to_str (jdata -> state ),
514+ (PRTE_FLAG_TEST (jdata , PRTE_JOB_FLAG_ABORTED )) ? "True" : "False" );
515+ bo .bytes = st ;
516+ bo .size = strlen (st );
517+ PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
518+ free (st );
519+ pmix_asprintf (& st , "\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld\n\n\tProcs:\n" ,
520+ (long ) jdata -> num_launched , (long ) jdata -> num_reported ,
521+ (long ) jdata -> num_terminated );
522+ bo .bytes = st ;
523+ bo .size = strlen (st );
524+ PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
525+ free (st );
526+ for (i = 0 ; i < jdata -> procs -> size ; i ++ ) {
527+ if (NULL != (proc = (prte_proc_t * ) pmix_pointer_array_get_item (jdata -> procs , i ))) {
528+ pmix_asprintf (& st , "\t\tRank: %s\tNode: %s\tPID: %u\tState: %s\tExitCode %d\n" ,
529+ PRTE_VPID_PRINT (proc -> name .rank ),
530+ (NULL == proc -> node ) ? "UNKNOWN" : proc -> node -> name ,
531+ (unsigned int ) proc -> pid , prte_proc_state_to_str (proc -> state ),
532+ proc -> exit_code );
533+ bo .bytes = st ;
534+ bo .size = strlen (st );
535+ PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
536+ free (st );
537+ }
538+ }
539+ st = "\n" ;
540+ bo .bytes = st ;
541+ bo .size = strlen (st );
542+ PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
543+ }
544+
545+ static int get_traces (prte_job_t * jdata )
546+ {
547+ prte_daemon_cmd_flag_t command = PRTE_DAEMON_GET_STACK_TRACES ;
548+ pmix_data_buffer_t buffer ;
549+ pmix_byte_object_t bo ;
550+ pmix_proc_t pc ;
551+ pmix_status_t rc ;
552+
553+ PMIX_LOAD_PROCID (& pc , jdata -> nspace , PMIX_RANK_WILDCARD );
554+ bo .bytes = "Waiting for stack traces (this may take a few moments)...\n" ;
555+ bo .size = strlen (bo .bytes );
556+ PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
557+
558+
559+ /* setup the buffer */
560+ PMIX_DATA_BUFFER_CONSTRUCT (& buffer );
561+ /* pack the command */
562+ rc = PMIx_Data_pack (NULL , & buffer , & command , 1 , PMIX_UINT8 );
563+ if (PMIX_SUCCESS != rc ) {
564+ PMIX_ERROR_LOG (rc );
565+ PMIX_DATA_BUFFER_DESTRUCT (& buffer );
566+ return PRTE_ERROR ;
567+ }
568+ /* pack the jobid */
569+ rc = PMIx_Data_pack (NULL , & buffer , & jdata -> nspace , 1 , PMIX_PROC_NSPACE );
570+ if (PMIX_SUCCESS != rc ) {
571+ PMIX_ERROR_LOG (rc );
572+ PMIX_DATA_BUFFER_DESTRUCT (& buffer );
573+ return PRTE_ERROR ;
574+ }
575+ /* goes to all daemons */
576+ if (PRTE_SUCCESS != (rc = prte_grpcomm .xcast (PRTE_RML_TAG_DAEMON , & buffer ))) {
577+ PRTE_ERROR_LOG (rc );
578+ PMIX_DATA_BUFFER_DESTRUCT (& buffer );
579+ return PRTE_ERROR ;
580+ }
581+ PMIX_DATA_BUFFER_DESTRUCT (& buffer );
582+ return PRTE_SUCCESS ;
583+ }
584+
497585static void job_timeout_cb (int fd , short event , void * cbdata )
498586{
499587 prte_job_t * jdata = (prte_job_t * ) cbdata ;
500588 prte_timer_t * timer = NULL ;
501- prte_proc_t * proc , prc ;
589+ prte_proc_t * prc ;
590+ prte_job_t * child ;
502591 pmix_proc_t pc ;
503- int i , rc , timeout , * tp ;
592+ int rc , timeout , * tp , i ;
504593 pmix_pointer_array_t parray ;
505594 pmix_byte_object_t bo ;
506595 char * st ;
@@ -534,83 +623,31 @@ static void job_timeout_cb(int fd, short event, void *cbdata)
534623 if (prte_get_attribute (& jdata -> attributes , PRTE_JOB_REPORT_STATE , NULL , PMIX_BOOL )) {
535624 /* output the results - note that the output might need to go to a
536625 * tool instead of just to stderr, so we use the PMIx IOF deliver
537- * function to ensure it gets where it needs to go */
538- pmix_asprintf (& st , "DATA FOR JOB: %s\n" , PRTE_JOBID_PRINT (jdata -> nspace ));
539- bo .bytes = st ;
540- bo .size = strlen (st );
541- PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
542- free (st );
543- pmix_asprintf (& st , "\tNum apps: %d\tNum procs: %d\tJobState: %s\tAbort: %s\n" ,
544- (int ) jdata -> num_apps , (int ) jdata -> num_procs , prte_job_state_to_str (jdata -> state ),
545- (PRTE_FLAG_TEST (jdata , PRTE_JOB_FLAG_ABORTED )) ? "True" : "False" );
546- bo .bytes = st ;
547- bo .size = strlen (st );
548- PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
549- free (st );
550- pmix_asprintf (& st , "\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld\n\n\tProcs:\n" ,
551- (long ) jdata -> num_launched , (long ) jdata -> num_reported ,
552- (long ) jdata -> num_terminated );
553- bo .bytes = st ;
554- bo .size = strlen (st );
555- PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
556- free (st );
557- for (i = 0 ; i < jdata -> procs -> size ; i ++ ) {
558- if (NULL != (proc = (prte_proc_t * ) pmix_pointer_array_get_item (jdata -> procs , i ))) {
559- pmix_asprintf (& st , "\t\tRank: %s\tNode: %s\tPID: %u\tState: %s\tExitCode %d\n" ,
560- PRTE_VPID_PRINT (proc -> name .rank ),
561- (NULL == proc -> node ) ? "UNKNOWN" : proc -> node -> name ,
562- (unsigned int ) proc -> pid , prte_proc_state_to_str (proc -> state ),
563- proc -> exit_code );
564- bo .bytes = st ;
565- bo .size = strlen (st );
566- PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
567- free (st );
568- }
569- }
570- st = "\n" ;
571- bo .bytes = st ;
572- bo .size = strlen (st );
573- PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
626+ * function to ensure it gets where it needs to go. */
627+ dump_job (jdata );
628+ }
629+
630+ /* Do this for all its child jobs, if any */
631+ PMIX_LIST_FOREACH (child , & jdata -> children , prte_job_t ) {
632+ dump_job (child );
574633 }
575634
576635 /* see if they want stacktraces */
577636 if (prte_get_attribute (& jdata -> attributes , PRTE_JOB_STACKTRACES , NULL , PMIX_BOOL )) {
578637 /* if they asked for stack_traces, attempt to get them, but timeout
579638 * if we cannot do so */
580- prte_daemon_cmd_flag_t command = PRTE_DAEMON_GET_STACK_TRACES ;
581- pmix_data_buffer_t buffer ;
582-
583- bo .bytes = "Waiting for stack traces (this may take a few moments)...\n" ;
584- bo .size = strlen (bo .bytes );
585- PMIx_server_IOF_deliver (& pc , PMIX_FWD_STDERR_CHANNEL , & bo , NULL , 0 , NULL , NULL );
586-
587- /* set the recv */
588- PRTE_RML_RECV (PRTE_NAME_WILDCARD , PRTE_RML_TAG_STACK_TRACE ,
589- PRTE_RML_PERSISTENT , stack_trace_recv , NULL );
590-
591- /* setup the buffer */
592- PMIX_DATA_BUFFER_CONSTRUCT (& buffer );
593- /* pack the command */
594- rc = PMIx_Data_pack (NULL , & buffer , & command , 1 , PMIX_UINT8 );
595- if (PMIX_SUCCESS != rc ) {
596- PMIX_ERROR_LOG (rc );
597- PMIX_DATA_BUFFER_DESTRUCT (& buffer );
598- goto giveup ;
599- }
600- /* pack the jobid */
601- rc = PMIx_Data_pack (NULL , & buffer , & jdata -> nspace , 1 , PMIX_PROC_NSPACE );
602- if (PMIX_SUCCESS != rc ) {
603- PMIX_ERROR_LOG (rc );
604- PMIX_DATA_BUFFER_DESTRUCT (& buffer );
639+ rc = get_traces (jdata );
640+ if (PRTE_SUCCESS != rc ) {
605641 goto giveup ;
606642 }
607- /* goes to all daemons */
608- if (PRTE_SUCCESS != (rc = prte_grpcomm .xcast (PRTE_RML_TAG_DAEMON , & buffer ))) {
609- PRTE_ERROR_LOG (rc );
610- PMIX_DATA_BUFFER_DESTRUCT (& buffer );
611- goto giveup ;
643+ // get traces for child jobs too
644+ PMIX_LIST_FOREACH (child , & jdata -> children , prte_job_t ) {
645+ rc = get_traces (child );
646+ if (PRTE_SUCCESS != rc ) {
647+ goto giveup ;
648+ }
612649 }
613- PMIX_DATA_BUFFER_DESTRUCT ( & buffer );
650+
614651 /* we will terminate after we get the stack_traces, but set a timeout
615652 * just in case we never hear back from everyone */
616653 if (prte_stack_trace_wait_timeout > 0 ) {
@@ -629,11 +666,30 @@ static void job_timeout_cb(int fd, short event, void *cbdata)
629666giveup :
630667 /* abort the job */
631668 PMIX_CONSTRUCT (& parray , pmix_pointer_array_t );
632- PMIX_LOAD_PROCID (& prc .name , jdata -> nspace , PMIX_RANK_WILDCARD );
633- pmix_pointer_array_add (& parray , & prc );
669+ pmix_pointer_array_init (& parray ,
670+ PRTE_GLOBAL_ARRAY_BLOCK_SIZE ,
671+ PRTE_GLOBAL_ARRAY_MAX_SIZE ,
672+ PRTE_GLOBAL_ARRAY_BLOCK_SIZE );
673+
674+ prc = PMIX_NEW (prte_proc_t );
675+ PMIX_LOAD_PROCID (& prc -> name , jdata -> nspace , PMIX_RANK_WILDCARD );
676+ pmix_pointer_array_add (& parray , prc );
677+ PMIX_LIST_FOREACH (child , & jdata -> children , prte_job_t ) {
678+ prc = PMIX_NEW (prte_proc_t );
679+ PMIX_LOAD_PROCID (& prc -> name , child -> nspace , PMIX_RANK_WILDCARD );
680+ pmix_pointer_array_add (& parray , prc );
681+ }
634682 if (PRTE_SUCCESS != (rc = prte_plm .terminate_procs (& parray ))) {
635683 PRTE_ERROR_LOG (rc );
636684 }
685+ for (i = 0 ; i < parray .size ; i ++ ) {
686+ prc = (prte_proc_t * ) pmix_pointer_array_get_item (& parray , i );
687+ if (NULL == prc ) {
688+ continue ;
689+ }
690+ pmix_pointer_array_set_item (& parray , i , NULL );
691+ PMIX_RELEASE (prc );
692+ }
637693 PMIX_DESTRUCT (& parray );
638694}
639695
0 commit comments