Skip to content

Commit 3b6f654

Browse files
authored
Merge pull request #3709 from rhc54/cmr2x/debugger
Update the debugger support so that it launches properly under a debugger and supports attaching to a running job
2 parents f9fd6bf + 09bfb66 commit 3b6f654

File tree

2 files changed

+74
-32
lines changed

2 files changed

+74
-32
lines changed

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
711711
/* setup the environment for this app */
712712
if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
713713

714-
OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
714+
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
715715
"%s odls:launch:setup_fork failed with error %s",
716716
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
717717
ORTE_ERROR_NAME(rc)));

orte/tools/orterun/orterun.c

Lines changed: 73 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2347,9 +2347,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
23472347

23482348
if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
23492349
/* if we were given a test debugger, then we still want to
2350-
* colaunch it
2350+
* colaunch it - unless we are testing attach to a running job
23512351
*/
2352-
if (NULL != orte_debugger_test_daemon) {
2352+
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
23532353
opal_output_verbose(2, orte_debug_output,
23542354
"%s No debugger test daemon specified",
23552355
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
@@ -2413,8 +2413,8 @@ static void setup_debugger_job(void)
24132413
* to avoid confusing the rest of the system's bookkeeping
24142414
*/
24152415
orte_plm_base_create_jobid(debugger);
2416-
/* set the personality to ORTE */
2417-
debugger->personality = strdup("orte");
2416+
/* set the personality to OMPI */
2417+
debugger->personality = strdup("ompi");
24182418
/* flag the job as being debugger daemons */
24192419
ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_DEBUGGER_DAEMON);
24202420
/* unless directed, we do not forward output */
@@ -2478,6 +2478,9 @@ static void setup_debugger_job(void)
24782478
proc = OBJ_NEW(orte_proc_t);
24792479
proc->name.jobid = debugger->jobid;
24802480
proc->name.vpid = vpid++;
2481+
/* point the proc at the local ORTE daemon as its parent */
2482+
proc->parent = node->daemon->name.vpid;
2483+
24812484
/* set the local/node ranks - we don't actually care
24822485
* what these are, but the odls needs them
24832486
*/
@@ -2518,43 +2521,63 @@ static bool mpir_breakpoint_fired = false;
25182521
void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
25192522
{
25202523
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
2521-
orte_job_t *jdata = caddy->jdata;
2524+
orte_job_t *jdata = caddy->jdata, *target;
25222525
orte_proc_t *proc;
25232526
orte_app_context_t *appctx;
25242527
orte_vpid_t i, j;
25252528
opal_buffer_t *buf;
25262529
int rc;
25272530
char **aliases, *aptr;
25282531

2532+
opal_output_verbose(5, orte_debug_output,
2533+
"%s INIT AFTER SPAWN FOR %s",
2534+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2535+
ORTE_JOBID_PRINT(caddy->jdata->jobid));
2536+
25292537
/* if we couldn't get thru the mapper stage, we might
25302538
* enter here with no procs. Avoid the "zero byte malloc"
25312539
* message by checking here
25322540
*/
25332541
if (MPIR_proctable || 0 == jdata->num_procs) {
2542+
25342543
/* already initialized */
25352544
opal_output_verbose(5, orte_debug_output,
25362545
"%s: debugger already initialized or zero procs",
25372546
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2538-
OBJ_RELEASE(caddy);
2539-
if (!mpir_breakpoint_fired) {
2540-
/* record that we have triggered the debugger */
2541-
mpir_breakpoint_fired = true;
25422547

2543-
/* trigger the debugger */
2544-
MPIR_Breakpoint();
2545-
2546-
/* send a message to rank=0 to release it */
2547-
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) ||
2548-
ORTE_PROC_STATE_UNTERMINATED < proc->state ) {
2549-
/* proc is already dead */
2550-
return;
2551-
}
2552-
buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */
2553-
if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf,
2554-
ORTE_RML_TAG_DEBUGGER_RELEASE,
2555-
orte_rml_send_callback, NULL))) {
2556-
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
2557-
OBJ_RELEASE(buf);
2548+
if (MPIR_being_debugged || NULL != orte_debugger_test_daemon ||
2549+
NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
2550+
OBJ_RELEASE(caddy);
2551+
if (!mpir_breakpoint_fired) {
2552+
/* record that we have triggered the debugger */
2553+
mpir_breakpoint_fired = true;
2554+
2555+
/* trigger the debugger */
2556+
MPIR_Breakpoint();
2557+
2558+
/* send a message to rank=0 of the job being debugged to release it */
2559+
target = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1);
2560+
if (NULL == target) {
2561+
/* the job is dead */
2562+
return;
2563+
}
2564+
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(target->procs, 0)) ||
2565+
ORTE_PROC_STATE_UNTERMINATED < proc->state ) {
2566+
/* proc is already dead */
2567+
return;
2568+
}
2569+
buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */
2570+
opal_output_verbose(5, orte_debug_output,
2571+
"%s SENDING DEBUGGER RELEASE TO %s %s:%d",
2572+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2573+
ORTE_NAME_PRINT(&proc->name),
2574+
__FILE__, __LINE__);
2575+
if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf,
2576+
ORTE_RML_TAG_DEBUGGER_RELEASE,
2577+
orte_rml_send_callback, NULL))) {
2578+
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
2579+
OBJ_RELEASE(buf);
2580+
}
25582581
}
25592582
}
25602583
return;
@@ -2649,8 +2672,13 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
26492672
/* trigger the debugger */
26502673
MPIR_Breakpoint();
26512674

2652-
/* send a message to rank=0 to release it */
2653-
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) ||
2675+
/* send a message to rank=0 of the job being debugged to release it */
2676+
target = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1);
2677+
if (NULL == target) {
2678+
/* the job is dead */
2679+
return;
2680+
}
2681+
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(target->procs, 0)) ||
26542682
ORTE_PROC_STATE_UNTERMINATED < proc->state) {
26552683
/* proc is already dead or never registered with us (so we don't have
26562684
* contact info for him)
@@ -2668,7 +2696,7 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
26682696
opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
26692697
OBJ_RELEASE(buf);
26702698
}
2671-
} else {
2699+
} else if (!orte_debugger_test_attach) {
26722700
/* if I am launching debugger daemons, then I need to do so now
26732701
* that the job has been started and I know which nodes have
26742702
* apps on them
@@ -2720,17 +2748,25 @@ static void open_fifo (void)
27202748
return;
27212749
}
27222750

2723-
opal_output_verbose(2, orte_debug_output,
2724-
"%s Monitoring debugger attach fifo %s",
2725-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2726-
MPIR_attach_fifo);
2751+
if (orte_debugger_test_attach) {
2752+
opal_output(0, "%s Monitoring debugger attach fifo %s",
2753+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2754+
MPIR_attach_fifo);
2755+
} else {
2756+
opal_output_verbose(2, orte_debug_output,
2757+
"%s Monitoring debugger attach fifo %s",
2758+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2759+
MPIR_attach_fifo);
2760+
}
27272761
attach = (opal_event_t*)malloc(sizeof(opal_event_t));
27282762
opal_event_set(orte_event_base, attach, attach_fd, OPAL_EV_READ, attach_debugger, attach);
27292763

27302764
fifo_active = true;
27312765
opal_event_add(attach, 0);
27322766
}
27332767

2768+
static bool did_once = false;
2769+
27342770
static void attach_debugger(int fd, short event, void *arg)
27352771
{
27362772
unsigned char fifo_cmd;
@@ -2786,6 +2822,12 @@ static void attach_debugger(int fd, short event, void *arg)
27862822
(NULL == orte_debugger_test_daemon) ?
27872823
MPIR_executable_path : orte_debugger_test_daemon);
27882824
setup_debugger_job();
2825+
did_once = true;
2826+
}
2827+
2828+
/* if we are testing, ensure we only do this once */
2829+
if (NULL != orte_debugger_test_daemon && did_once) {
2830+
return;
27892831
}
27902832

27912833
/* reset the read or timer event */

0 commit comments

Comments
 (0)