Skip to content

Commit d31f173

Browse files
author
rhc54
authored
Merge pull request #2476 from rhc54/topic/dbgupdate
Bring forward the debugger-related changes
2 parents ef5ee73 + d5fd635 commit d31f173

File tree

11 files changed

+95
-19
lines changed

11 files changed

+95
-19
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,7 @@ orte/mca/sstore/orte_sstore.7
353353

354354
orte/test/mpi/abort
355355
orte/test/mpi/accept
356+
orte/test/mpi/attach
356357
orte/test/mpi/bad_exit
357358
orte/test/mpi/bcast_loop
358359
orte/test/mpi/concurrent_spawn

ompi/mca/rte/orte/rte_orte_module.c

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
33
* All rights reserved.
4-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
4+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
55
* Copyright (c) 2012-2014 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
@@ -133,6 +133,8 @@ void ompi_rte_wait_for_debugger(void)
133133
int debugger;
134134
opal_list_t *codes;
135135
opal_value_t *kv;
136+
char *evar;
137+
int time;
136138

137139
/* See lengthy comment in orte/tools/orterun/debuggers.c about
138140
orte_in_parallel_debugger */
@@ -152,6 +154,12 @@ void ompi_rte_wait_for_debugger(void)
152154
*/
153155
ompi_debugger_setup_dlls();
154156

157+
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
158+
time = strtol(evar, NULL, 10);
159+
sleep(time);
160+
return;
161+
}
162+
155163
if (orte_standalone_operation) {
156164
/* spin until debugger attaches and releases us */
157165
while (MPIR_debug_gate == 0) {

orte/mca/oob/base/oob_base_stubs.c

+5
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
120120
* this is a local proc we just haven't heard from
121121
* yet due to a race condition. Check that situation */
122122
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
123+
++msg->retries;
124+
if (msg->retries < orte_rml_base.max_retries) {
125+
ORTE_OOB_SEND(msg);
126+
return;
127+
}
123128
ORTE_OOB_SEND(msg);
124129
return;
125130
}

orte/mca/plm/base/plm_base_launch_support.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -839,8 +839,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
839839
}
840840

841841
cleanup:
842-
/* need to init_after_spawn for debuggers */
843-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
842+
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
843+
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
844+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
845+
}
844846

845847
OBJ_RELEASE(caddy);
846848
}

orte/mca/rml/base/base.h

+2
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ typedef struct {
102102
opal_pointer_array_t conduits; /* array to hold the open conduits */
103103
opal_list_t posted_recvs;
104104
opal_list_t unmatched_msgs;
105+
int max_retries;
105106
#if OPAL_ENABLE_TIMING
106107
bool timing;
107108
#endif
@@ -116,6 +117,7 @@ typedef struct {
116117
orte_process_name_t origin;
117118
int status; // returned status on send
118119
orte_rml_tag_t tag; // targeted tag
120+
int retries; // #times we have tried to send it
119121

120122
/* user's send callback functions and data */
121123
union {

orte/mca/rml/base/rml_base_frame.c

+9
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,14 @@ static bool selected = false;
6363

6464
static int orte_rml_base_register(mca_base_register_flag_t flags)
6565
{
66+
orte_rml_base.max_retries = 3;
67+
mca_base_var_register("orte", "rml", "base", "max_retries",
68+
"Max #times to retry sending a message",
69+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
70+
OPAL_INFO_LVL_9,
71+
MCA_BASE_VAR_SCOPE_READONLY,
72+
&orte_rml_base.max_retries);
73+
6674
#if OPAL_ENABLE_TIMING
6775
orte_rml_base.timing = false;
6876
(void) mca_base_var_register ("orte", "rml", "base", "timing",
@@ -240,6 +248,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
240248
/*** RML CLASS INSTANCES ***/
241249
static void send_cons(orte_rml_send_t *ptr)
242250
{
251+
ptr->retries = 0;
243252
ptr->cbdata = NULL;
244253
ptr->iov = NULL;
245254
ptr->buffer = NULL;

orte/mca/schizo/singularity/schizo_singularity.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
4141
bool takeus = false;
4242
char *t2, *pth, *newenv;
4343

44-
if (NULL != orte_schizo_base.personalities) {
44+
if (NULL != orte_schizo_base.personalities &&
45+
NULL != jdata->personality) {
4546
/* see if we are included */
4647
for (i=0; NULL != jdata->personality[i]; i++) {
4748
if (0 == strcmp(jdata->personality[i], "singularity")) {
@@ -106,4 +107,3 @@ static int setup_fork(orte_job_t *jdata, orte_app_context_t *app)
106107

107108
return ORTE_SUCCESS;
108109
}
109-

orte/orted/orted_submit.c

+30-8
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,20 @@ int orte_submit_job(char *argv[], int *index,
896896
}
897897
}
898898

899+
/* check for debugger test envars and forward them if necessary */
900+
if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
901+
char *evar;
902+
evar = getenv("ORTE_TEST_DEBUGGER_SLEEP");
903+
for (i=0; i < (int)jdata->num_apps; i++) {
904+
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
905+
opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env);
906+
if (NULL != evar) {
907+
opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env);
908+
}
909+
}
910+
}
911+
}
912+
899913
/* check for suicide test directives */
900914
if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") ||
901915
NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) {
@@ -2149,8 +2163,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata)
21492163
*/
21502164
if (NULL != orte_debugger_test_daemon && !orte_debugger_test_attach) {
21512165
opal_output_verbose(2, orte_debug_output,
2152-
"%s No debugger test daemon specified",
2153-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2166+
"%s Debugger test daemon specified: %s",
2167+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2168+
orte_debugger_test_daemon);
21542169
goto launchit;
21552170
}
21562171
/* if we were given an auto-detect rate, then we want to setup
@@ -2362,6 +2377,8 @@ static void setup_debugger_job(void)
23622377
proc = OBJ_NEW(orte_proc_t);
23632378
proc->name.jobid = debugger->jobid;
23642379
proc->name.vpid = vpid++;
2380+
/* point the proc at the local ORTE daemon as its parent */
2381+
proc->parent = node->daemon->name.vpid;
23652382
/* set the local/node ranks - we don't actually care
23662383
* what these are, but the odls needs them
23672384
*/
@@ -2741,7 +2758,7 @@ static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line,
27412758
static void open_fifo(void)
27422759
{
27432760
if (orte_debugger_attach_fd > 0) {
2744-
close(orte_debugger_attach_fd);
2761+
close(orte_debugger_attach_fd);
27452762
}
27462763

27472764
orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0);
@@ -2760,10 +2777,16 @@ static void open_fifo(void)
27602777
return;
27612778
}
27622779

2763-
opal_output_verbose(2, orte_debug_output,
2764-
"%s Monitoring debugger attach fifo %s",
2765-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2766-
MPIR_attach_fifo);
2780+
if (orte_debugger_test_attach) {
2781+
opal_output(0, "%s Monitoring debugger attach fifo %s",
2782+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2783+
MPIR_attach_fifo);
2784+
} else {
2785+
opal_output_verbose(2, orte_debug_output,
2786+
"%s Monitoring debugger attach fifo %s",
2787+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2788+
MPIR_attach_fifo);
2789+
}
27672790
orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t));
27682791
opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd,
27692792
OPAL_EV_READ, attach_debugger, orte_debugger_attach);
@@ -3232,4 +3255,3 @@ void orte_profile_wakeup(int sd, short args, void *cbdata)
32323255
/* abort the job */
32333256
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
32343257
}
3235-

orte/test/mpi/Makefile

+1-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll iof
1+
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach
22

33
all: $(PROGS)
44

@@ -10,11 +10,6 @@ hello_output: hello_output.c
1010
hello_show_help: hello_show_help.c
1111
$(CC) $(CFLAGS) $(CFLAGS_INTERNAL) $^ -o $@
1212

13-
hello.sapp: hello.c myhello.spec
14-
$(CC) $(CFLAGS) $(CLAGS_INTERNAL) hello.c -o hello
15-
singularity build myhello.spec
16-
singularity install hello.sapp
17-
1813
CC = mpicc
1914
CFLAGS = -g --openmpi:linkall
2015
CFLAGS_INTERNAL = -I../../.. -I../../../orte/include -I../../../opal/include

orte/test/mpi/attach.c

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/* -*- C -*-
2+
*
3+
* $HEADER$
4+
*
5+
* Test helper that writes a one-byte command to the debugger attach fifo
6+
*/
7+
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
#include <sys/types.h>
11+
#include <sys/stat.h>
12+
#include <fcntl.h>
13+
#include <unistd.h>
14+
15+
/**
 * Write a single one-byte command to the debugger attach fifo.
 *
 * @param argc  argument count; must be at least 2
 * @param argv  argv[1] is the full path to the debugger fifo file
 *
 * @return 0 when the byte was written and the fifo closed cleanly,
 *         1 on a usage error or when the fifo could not be opened,
 *         written, or closed
 */
int main(int argc, char* argv[])
{
    /* command byte sent down the fifo (value 1; presumably the "attach"
     * request expected by the reader - confirm against the fifo consumer) */
    unsigned char fifo_cmd = 1;
    ssize_t nwritten;
    int fd;
    int rc = 0;

    /* argv[0] is always counted, so the fifo path requires argc >= 2;
     * without this check argv[1] would be NULL when no path is given */
    if (2 > argc) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (0 > fd) {
        perror("attach: open");
        return 1;
    }

    nwritten = write(fd, &fifo_cmd, sizeof(fifo_cmd));
    if (0 > nwritten) {
        perror("attach: write");
        rc = 1;
    } else if ((ssize_t)sizeof(fifo_cmd) != nwritten) {
        fprintf(stderr, "attach: short write to %s\n", argv[1]);
        rc = 1;
    }

    /* always release the descriptor, even if the write failed */
    if (0 != close(fd)) {
        perror("attach: close");
        rc = 1;
    }

    return rc;
}

orte/util/error_strings.c

+2
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
354354
return "FAULT TOLERANCE RESTART";
355355
case ORTE_JOB_STATE_ANY:
356356
return "ANY";
357+
case ORTE_JOB_STATE_DEBUGGER_DETACH:
358+
return "DEBUGGER DETACH";
357359
default:
358360
return "UNKNOWN STATE!";
359361
}

0 commit comments

Comments
 (0)