Skip to content

Commit 3d4e193

Browse files
committed
Fix debugger attach and cospawn of debugger daemons for the STAT debugger. Add ability to test the support minus the actual debugger.
Fixes open-mpi#2411

Continue cleanup of STAT debugger attach:
* Limit the number of times we retry sending of a message to avoid an infinite loop
* Don't execute the "init_debugger_after_spawn" state for debugger jobs
* Add a new test program "attach" that takes the debugger attach fifo as its argument, and then simulates attach by writing a byte down the fifo

Output the attach fifo info if we are testing attach so we know where to attach to - otherwise, use the output_verbose

Always send "debugger release" to the job actually being debugged, not the debugger itself

Signed-off-by: Ralph Castain <[email protected]>

Remove debug

Signed-off-by: Ralph Castain <[email protected]>

Conflicts:
orte/tools/orterun/orterun.c

(cherry picked from commit 8fa33bfdb3f6ca1667c558f06c3c8f0fb5482a8d)
1 parent 7e180d2 commit 3d4e193

File tree

5 files changed

+67
-16
lines changed

5 files changed

+67
-16
lines changed

orte/mca/oob/base/oob_base_stubs.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
138138
return;
139139
}
140140
}
141+
opal_output_verbose(5, orte_oob_base_framework.framework_output,
142+
"%s CANNOT SEND TO %s: TAG %d",
143+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
144+
ORTE_NAME_PRINT(&msg->dst), msg->tag);
141145
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
142146
ORTE_RML_SEND_COMPLETE(msg);
143147
return;

orte/mca/rml/base/rml_base_frame.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
7070
MCA_BASE_VAR_SCOPE_READONLY,
7171
&orte_rml_base.max_retries);
7272

73+
orte_rml_base.max_retries = 3;
74+
mca_base_var_register("orte", "rml", "base", "max_retries",
75+
"Max #times to retry sending a message",
76+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
77+
OPAL_INFO_LVL_9,
78+
MCA_BASE_VAR_SCOPE_READONLY,
79+
&orte_rml_base.max_retries);
80+
7381
#if OPAL_ENABLE_TIMING
7482
orte_rml_base.timing = false;
7583
(void) mca_base_var_register ("orte", "rml", "base", "timing",

orte/mca/routed/radix/routed_radix.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,10 @@ static orte_process_name_t get_route(orte_process_name_t *target)
274274
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
275275
/* find out what daemon hosts this proc */
276276
if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
277+
opal_output_verbose(2, orte_routed_base_framework.framework_output,
278+
"%s ATTEMPTING TO SEND TO %s",
279+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
280+
ORTE_NAME_PRINT(target));
277281
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
278282
ret = ORTE_NAME_INVALID;
279283
goto found;

orte/mca/state/base/state_base_fns.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -725,12 +725,9 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
725725
} else if (ORTE_PROC_STATE_TERMINATED == state) {
726726
/* update the proc state */
727727
ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
728-
if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
729-
pdata->state = state;
730-
}
728+
/// opal_pmix.server_deregister_client(proc, NULL, NULL);
729+
pdata->state = state;
731730
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
732-
/* tell the PMIx subsystem to cleanup this client */
733-
opal_pmix.server_deregister_client(proc, NULL, NULL);
734731
/* Clean up the session directory as if we were the process
735732
* itself. This covers the case where the process died abnormally
736733
* and didn't cleanup its own session directory.
@@ -762,6 +759,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
762759
/* track job status */
763760
jdata->num_terminated++;
764761
if (jdata->num_terminated == jdata->num_procs) {
762+
<<<<<<< HEAD
765763
/* if requested, check fd status for leaks */
766764
if (orte_state_base_run_fdcheck) {
767765
orte_state_base_check_fds(jdata);
@@ -780,6 +778,9 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
780778
parent.jobid = jdata->jobid;
781779
parent.vpid = ORTE_VPID_WILDCARD;
782780
_send_notification(OPAL_ERR_PROC_ABORTED, pdata->state, &pdata->name, &parent);
781+
=======
782+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
783+
>>>>>>> 8fa33bfd... Fix debugger attach and cospawn of debugger daemons for the STAT debugger. Add ability to test the support minus the actual debugger.
783784
}
784785
}
785786

@@ -989,7 +990,11 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
989990
/* this was a debugger daemon. notify that a debugger has detached */
990991
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
991992
}
993+
<<<<<<< HEAD
992994
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
995+
=======
996+
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
997+
>>>>>>> 8fa33bfd... Fix debugger attach and cospawn of debugger daemons for the STAT debugger. Add ability to test the support minus the actual debugger.
993998
OBJ_RELEASE(jdata);
994999
}
9951000
}

orte/tools/orterun/orterun.c

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2005,6 +2005,9 @@ static void setup_debugger_job(void)
20052005
proc = OBJ_NEW(orte_proc_t);
20062006
proc->name.jobid = debugger->jobid;
20072007
proc->name.vpid = vpid++;
2008+
/* point the proc at the local ORTE daemon as its parent */
2009+
proc->parent = node->daemon->name.vpid;
2010+
20082011
/* set the local/node ranks - we don't actually care
20092012
* what these are, but the odls needs them
20102013
*/
@@ -2045,14 +2048,19 @@ static bool mpir_breakpoint_fired = false;
20452048
void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
20462049
{
20472050
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
2048-
orte_job_t *jdata = caddy->jdata;
2051+
orte_job_t *jdata = caddy->jdata, *target;
20492052
orte_proc_t *proc;
20502053
orte_app_context_t *appctx;
20512054
orte_vpid_t i, j;
20522055
opal_buffer_t *buf;
20532056
int rc;
20542057
char **aliases, *aptr;
20552058

2059+
opal_output_verbose(5, orte_debug_output,
2060+
"%s INIT AFTER SPAWN FOR %s",
2061+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2062+
ORTE_JOBID_PRINT(caddy->jdata->jobid));
2063+
20562064
/* if we couldn't get thru the mapper stage, we might
20572065
* enter here with no procs. Avoid the "zero byte malloc"
20582066
* message by checking here
@@ -2074,13 +2082,23 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
20742082
/* trigger the debugger */
20752083
MPIR_Breakpoint();
20762084

2077-
/* send a message to rank=0 to release it */
2078-
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) ||
2085+
/* send a message to rank=0 of the job being debugged to release it */
2086+
target = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1);
2087+
if (NULL == target) {
2088+
/* the job is dead */
2089+
return;
2090+
}
2091+
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(target->procs, 0)) ||
20792092
ORTE_PROC_STATE_UNTERMINATED < proc->state ) {
20802093
/* proc is already dead */
20812094
return;
20822095
}
20832096
buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */
2097+
opal_output_verbose(5, orte_debug_output,
2098+
"%s SENDING DEBUGGER RELEASE TO %s %s:%d",
2099+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2100+
ORTE_NAME_PRINT(&proc->name),
2101+
__FILE__, __LINE__);
20842102
if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf,
20852103
ORTE_RML_TAG_DEBUGGER_RELEASE,
20862104
orte_rml_send_callback, NULL))) {
@@ -2181,18 +2199,24 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata)
21812199
/* trigger the debugger */
21822200
MPIR_Breakpoint();
21832201

2184-
/* send a message to rank=0 to release it */
2185-
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) ||
2202+
/* send a message to rank=0 of the job being debugged to release it */
2203+
target = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1);
2204+
if (NULL == target) {
2205+
/* the job is dead */
2206+
return;
2207+
}
2208+
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(target->procs, 0)) ||
21862209
ORTE_PROC_STATE_UNTERMINATED < proc->state) {
21872210
/* proc is already dead or never registered with us (so we don't have
21882211
* contact info for him)
21892212
*/
21902213
return;
21912214
}
21922215
opal_output_verbose(2, orte_debug_output,
2193-
"%s sending debugger release to %s",
2216+
"%s SENDING DEBUGGER RELEASE TO %s %s:%d",
21942217
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2195-
ORTE_NAME_PRINT(&proc->name));
2218+
ORTE_NAME_PRINT(&proc->name),
2219+
__FILE__, __LINE__);
21962220
buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */
21972221
if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf,
21982222
ORTE_RML_TAG_DEBUGGER_RELEASE,
@@ -2252,10 +2276,16 @@ static void open_fifo (void)
22522276
return;
22532277
}
22542278

2255-
opal_output_verbose(2, orte_debug_output,
2256-
"%s Monitoring debugger attach fifo %s",
2257-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2258-
MPIR_attach_fifo);
2279+
if (orte_debugger_test_attach) {
2280+
opal_output(0, "%s Monitoring debugger attach fifo %s",
2281+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2282+
MPIR_attach_fifo);
2283+
} else {
2284+
opal_output_verbose(2, orte_debug_output,
2285+
"%s Monitoring debugger attach fifo %s",
2286+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2287+
MPIR_attach_fifo);
2288+
}
22592289
attach = (opal_event_t*)malloc(sizeof(opal_event_t));
22602290
opal_event_set(orte_event_base, attach, attach_fd, OPAL_EV_READ, attach_debugger, attach);
22612291

0 commit comments

Comments
 (0)