@@ -2347,9 +2347,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
2347
2347
2348
2348
if (!MPIR_being_debugged && !orte_in_parallel_debugger ) {
2349
2349
/* if we were given a test debugger, then we still want to
2350
- * colaunch it
2350
+ * colaunch it - unless we are testing attach to a running job
2351
2351
*/
2352
- if (NULL != orte_debugger_test_daemon ) {
2352
+ if (NULL != orte_debugger_test_daemon && ! orte_debugger_test_attach ) {
2353
2353
opal_output_verbose (2 , orte_debug_output ,
2354
2354
"%s No debugger test daemon specified" ,
2355
2355
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
@@ -2413,8 +2413,8 @@ static void setup_debugger_job(void)
2413
2413
* to avoid confusing the rest of the system's bookkeeping
2414
2414
*/
2415
2415
orte_plm_base_create_jobid (debugger );
2416
- /* set the personality to ORTE */
2417
- debugger -> personality = strdup ("orte " );
2416
+ /* set the personality to OMPI */
2417
+ debugger -> personality = strdup ("ompi " );
2418
2418
/* flag the job as being debugger daemons */
2419
2419
ORTE_FLAG_SET (debugger , ORTE_JOB_FLAG_DEBUGGER_DAEMON );
2420
2420
/* unless directed, we do not forward output */
@@ -2478,6 +2478,9 @@ static void setup_debugger_job(void)
2478
2478
proc = OBJ_NEW (orte_proc_t );
2479
2479
proc -> name .jobid = debugger -> jobid ;
2480
2480
proc -> name .vpid = vpid ++ ;
2481
+ /* point the proc at the local ORTE daemon as its parent */
2482
+ proc -> parent = node -> daemon -> name .vpid ;
2483
+
2481
2484
/* set the local/node ranks - we don't actually care
2482
2485
* what these are, but the odls needs them
2483
2486
*/
@@ -2518,43 +2521,63 @@ static bool mpir_breakpoint_fired = false;
2518
2521
void orte_debugger_init_after_spawn (int fd , short event , void * cbdata )
2519
2522
{
2520
2523
orte_state_caddy_t * caddy = (orte_state_caddy_t * )cbdata ;
2521
- orte_job_t * jdata = caddy -> jdata ;
2524
+ orte_job_t * jdata = caddy -> jdata , * target ;
2522
2525
orte_proc_t * proc ;
2523
2526
orte_app_context_t * appctx ;
2524
2527
orte_vpid_t i , j ;
2525
2528
opal_buffer_t * buf ;
2526
2529
int rc ;
2527
2530
char * * aliases , * aptr ;
2528
2531
2532
+ opal_output_verbose (5 , orte_debug_output ,
2533
+ "%s INIT AFTER SPAWN FOR %s" ,
2534
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
2535
+ ORTE_JOBID_PRINT (caddy -> jdata -> jobid ));
2536
+
2529
2537
/* if we couldn't get through the mapper stage, we might
2530
2538
* enter here with no procs. Avoid the "zero byte malloc"
2531
2539
* message by checking here
2532
2540
*/
2533
2541
if (MPIR_proctable || 0 == jdata -> num_procs ) {
2542
+
2534
2543
/* already initialized */
2535
2544
opal_output_verbose (5 , orte_debug_output ,
2536
2545
"%s: debugger already initialized or zero procs" ,
2537
2546
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ));
2538
- OBJ_RELEASE (caddy );
2539
- if (!mpir_breakpoint_fired ) {
2540
- /* record that we have triggered the debugger */
2541
- mpir_breakpoint_fired = true;
2542
2547
2543
- /* trigger the debugger */
2544
- MPIR_Breakpoint ();
2545
-
2546
- /* send a message to rank=0 to release it */
2547
- if (NULL == (proc = (orte_proc_t * )opal_pointer_array_get_item (jdata -> procs , 0 )) ||
2548
- ORTE_PROC_STATE_UNTERMINATED < proc -> state ) {
2549
- /* proc is already dead */
2550
- return ;
2551
- }
2552
- buf = OBJ_NEW (opal_buffer_t ); /* don't need anything in this */
2553
- if (0 > (rc = orte_rml .send_buffer_nb (& proc -> name , buf ,
2554
- ORTE_RML_TAG_DEBUGGER_RELEASE ,
2555
- orte_rml_send_callback , NULL ))) {
2556
- opal_output (0 , "Error: could not send debugger release to MPI procs - error %s" , ORTE_ERROR_NAME (rc ));
2557
- OBJ_RELEASE (buf );
2548
+ if (MPIR_being_debugged || NULL != orte_debugger_test_daemon ||
2549
+ NULL != getenv ("ORTE_TEST_DEBUGGER_ATTACH" )) {
2550
+ OBJ_RELEASE (caddy );
2551
+ if (!mpir_breakpoint_fired ) {
2552
+ /* record that we have triggered the debugger */
2553
+ mpir_breakpoint_fired = true;
2554
+
2555
+ /* trigger the debugger */
2556
+ MPIR_Breakpoint ();
2557
+
2558
+ /* send a message to rank=0 of the job being debugged to release it */
2559
+ target = (orte_job_t * )opal_pointer_array_get_item (orte_job_data , 1 );
2560
+ if (NULL == target ) {
2561
+ /* the job is dead */
2562
+ return ;
2563
+ }
2564
+ if (NULL == (proc = (orte_proc_t * )opal_pointer_array_get_item (target -> procs , 0 )) ||
2565
+ ORTE_PROC_STATE_UNTERMINATED < proc -> state ) {
2566
+ /* proc is already dead */
2567
+ return ;
2568
+ }
2569
+ buf = OBJ_NEW (opal_buffer_t ); /* don't need anything in this */
2570
+ opal_output_verbose (5 , orte_debug_output ,
2571
+ "%s SENDING DEBUGGER RELEASE TO %s %s:%d" ,
2572
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
2573
+ ORTE_NAME_PRINT (& proc -> name ),
2574
+ __FILE__ , __LINE__ );
2575
+ if (0 > (rc = orte_rml .send_buffer_nb (& proc -> name , buf ,
2576
+ ORTE_RML_TAG_DEBUGGER_RELEASE ,
2577
+ orte_rml_send_callback , NULL ))) {
2578
+ opal_output (0 , "Error: could not send debugger release to MPI procs - error %s" , ORTE_ERROR_NAME (rc ));
2579
+ OBJ_RELEASE (buf );
2580
+ }
2558
2581
}
2559
2582
}
2560
2583
return ;
@@ -2649,8 +2672,13 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
2649
2672
/* trigger the debugger */
2650
2673
MPIR_Breakpoint ();
2651
2674
2652
- /* send a message to rank=0 to release it */
2653
- if (NULL == (proc = (orte_proc_t * )opal_pointer_array_get_item (jdata -> procs , 0 )) ||
2675
+ /* send a message to rank=0 of the job being debugged to release it */
2676
+ target = (orte_job_t * )opal_pointer_array_get_item (orte_job_data , 1 );
2677
+ if (NULL == target ) {
2678
+ /* the job is dead */
2679
+ return ;
2680
+ }
2681
+ if (NULL == (proc = (orte_proc_t * )opal_pointer_array_get_item (target -> procs , 0 )) ||
2654
2682
ORTE_PROC_STATE_UNTERMINATED < proc -> state ) {
2655
2683
/* proc is already dead or never registered with us (so we don't have
2656
2684
* contact info for him)
@@ -2668,7 +2696,7 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
2668
2696
opal_output (0 , "Error: could not send debugger release to MPI procs - error %s" , ORTE_ERROR_NAME (rc ));
2669
2697
OBJ_RELEASE (buf );
2670
2698
}
2671
- } else {
2699
+ } else if (! orte_debugger_test_attach ) {
2672
2700
/* if I am launching debugger daemons, then I need to do so now
2673
2701
* that the job has been started and I know which nodes have
2674
2702
* apps on them
@@ -2720,17 +2748,25 @@ static void open_fifo (void)
2720
2748
return ;
2721
2749
}
2722
2750
2723
- opal_output_verbose (2 , orte_debug_output ,
2724
- "%s Monitoring debugger attach fifo %s" ,
2725
- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
2726
- MPIR_attach_fifo );
2751
+ if (orte_debugger_test_attach ) {
2752
+ opal_output (0 , "%s Monitoring debugger attach fifo %s" ,
2753
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
2754
+ MPIR_attach_fifo );
2755
+ } else {
2756
+ opal_output_verbose (2 , orte_debug_output ,
2757
+ "%s Monitoring debugger attach fifo %s" ,
2758
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
2759
+ MPIR_attach_fifo );
2760
+ }
2727
2761
attach = (opal_event_t * )malloc (sizeof (opal_event_t ));
2728
2762
opal_event_set (orte_event_base , attach , attach_fd , OPAL_EV_READ , attach_debugger , attach );
2729
2763
2730
2764
fifo_active = true;
2731
2765
opal_event_add (attach , 0 );
2732
2766
}
2733
2767
2768
+ static bool did_once = false; /* set in attach_debugger right after setup_debugger_job() is called; checked there together with orte_debugger_test_daemon to return early so the test path runs only once */
2769
+
2734
2770
static void attach_debugger (int fd , short event , void * arg )
2735
2771
{
2736
2772
unsigned char fifo_cmd ;
@@ -2786,6 +2822,12 @@ static void attach_debugger(int fd, short event, void *arg)
2786
2822
(NULL == orte_debugger_test_daemon ) ?
2787
2823
MPIR_executable_path : orte_debugger_test_daemon );
2788
2824
setup_debugger_job ();
2825
+ did_once = true;
2826
+ }
2827
+
2828
+ /* if we are testing, ensure we only do this once */
2829
+ if (NULL != orte_debugger_test_daemon && did_once ) {
2830
+ return ;
2789
2831
}
2790
2832
2791
2833
/* reset the read or timer event */
0 commit comments