Skip to content

Commit ce73959

Browse files
authored
Merge pull request #2425 from rhc54/topic/stat
Fix debugger attach and cospawn of debugger daemons for the STAT debugger
2 parents f8dae5f + 9c1f649 commit ce73959

File tree

14 files changed

+206
-77
lines changed

14 files changed

+206
-77
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ orte/mca/sstore/orte_sstore.7
341341

342342
orte/test/mpi/abort
343343
orte/test/mpi/accept
344+
orte/test/mpi/attach
344345
orte/test/mpi/bad_exit
345346
orte/test/mpi/bcast_loop
346347
orte/test/mpi/concurrent_spawn

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
33
* All rights reserved.
4-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
4+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
55
* Copyright (c) 2012-2014 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
@@ -104,6 +104,8 @@ void ompi_rte_wait_for_debugger(void)
104104
{
105105
int debugger;
106106
orte_rml_recv_cb_t xfer;
107+
char *evar;
108+
int time;
107109

108110
/* See lengthy comment in orte/tools/orterun/debuggers.c about
109111
orte_in_parallel_debugger */
@@ -123,6 +125,12 @@ void ompi_rte_wait_for_debugger(void)
123125
*/
124126
ompi_debugger_setup_dlls();
125127

128+
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
129+
time = strtol(evar, NULL, 10);
130+
sleep(time);
131+
return;
132+
}
133+
126134
if (orte_standalone_operation) {
127135
/* spin until debugger attaches and releases us */
128136
while (MPIR_debug_gate == 0) {

orte/mca/oob/base/oob_base_stubs.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
/*
33
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
5+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
66
* $COPYRIGHT$
77
*
88
* Additional copyrights may follow
@@ -117,9 +117,16 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
117117
* this is a local proc we just haven't heard from
118118
* yet due to a race condition. Check that situation */
119119
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
120-
ORTE_OOB_SEND(msg);
121-
return;
120+
++msg->retries;
121+
if (msg->retries < orte_rml_base.max_retries) {
122+
ORTE_OOB_SEND(msg);
123+
return;
124+
}
122125
}
126+
opal_output_verbose(5, orte_oob_base_framework.framework_output,
127+
"%s CANNOT SEND TO %s: TAG %d",
128+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
129+
ORTE_NAME_PRINT(&msg->dst), msg->tag);
123130
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
124131
ORTE_RML_SEND_COMPLETE(msg);
125132
return;
@@ -396,4 +403,3 @@ static void process_uri(char *uri)
396403
}
397404
opal_argv_free(uris);
398405
}
399-

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2009 Institut National de Recherche en Informatique
1414
* et Automatique. All rights reserved.
1515
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
16-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014-2015 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
1919
* Copyright (c) 2016 IBM Corporation. All rights reserved.
@@ -757,9 +757,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
757757
}
758758

759759
cleanup:
760-
/* need to init_after_spawn for debuggers */
761-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
762-
760+
/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
761+
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
762+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
763+
}
763764
OBJ_RELEASE(caddy);
764765
}
765766

orte/mca/rml/base/base.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -84,6 +84,7 @@ ORTE_DECLSPEC void orte_rml_base_comm_stop(void);
8484
typedef struct {
8585
opal_list_t posted_recvs;
8686
opal_list_t unmatched_msgs;
87+
int max_retries;
8788
#if OPAL_ENABLE_TIMING
8889
bool timing;
8990
#endif
@@ -123,6 +124,7 @@ typedef struct {
123124
orte_process_name_t origin;
124125
int status; // returned status on send
125126
orte_rml_tag_t tag; // targeted tag
127+
int retries; // #times we have tried to send it
126128

127129
/* user's send callback functions and data */
128130
union {

orte/mca/rml/base/rml_base_frame.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
6161
&orte_rml_base_wrapper);
6262
(void) mca_base_var_register_synonym(var_id, "orte", "rml",NULL,"wrapper", 0);
6363

64+
orte_rml_base.max_retries = 3;
65+
mca_base_var_register("orte", "rml", "base", "max_retries",
66+
"Max #times to retry sending a message",
67+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
68+
OPAL_INFO_LVL_9,
69+
MCA_BASE_VAR_SCOPE_READONLY,
70+
&orte_rml_base.max_retries);
71+
6472
#if OPAL_ENABLE_TIMING
6573
orte_rml_base.timing = false;
6674
(void) mca_base_var_register ("orte", "rml", "base", "timing",
@@ -259,6 +267,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
259267
/*** RML CLASS INSTANCES ***/
260268
static void send_cons(orte_rml_send_t *ptr)
261269
{
270+
ptr->retries = 0;
262271
ptr->cbdata = NULL;
263272
ptr->iov = NULL;
264273
ptr->buffer = NULL;
@@ -325,4 +334,3 @@ static void prq_des(orte_rml_recv_request_t *ptr)
325334
OBJ_CLASS_INSTANCE(orte_rml_recv_request_t,
326335
opal_object_t,
327336
prq_cons, prq_des);
328-

orte/mca/routed/radix/routed_radix.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* reserved.
77
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
88
* reserved.
9-
* Copyright (c) 2013 Intel, Inc. All rights reserved.
9+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1010
* $COPYRIGHT$
1111
*
1212
* Additional copyrights may follow
@@ -372,6 +372,10 @@ static orte_process_name_t get_route(orte_process_name_t *target)
372372
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
373373
/* find out what daemon hosts this proc */
374374
if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
375+
opal_output_verbose(2, orte_routed_base_framework.framework_output,
376+
"%s ATTEMPTING TO SEND TO %s",
377+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
378+
ORTE_NAME_PRINT(target));
375379
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
376380
ret = ORTE_NAME_INVALID;
377381
goto found;

orte/mca/schizo/base/schizo_base_stubs.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -22,7 +22,6 @@ int orte_schizo_base_parse_cli(char *personality,
2222
orte_schizo_base_active_module_t *mod;
2323

2424
if (NULL == personality) {
25-
opal_output(0, "NULL PERSONALITY");
2625
return ORTE_ERR_NOT_SUPPORTED;
2726
}
2827

@@ -63,6 +62,11 @@ int orte_schizo_base_setup_fork(orte_job_t *jdata,
6362
int rc;
6463
orte_schizo_base_active_module_t *mod;
6564

65+
/* if no personality was specified, then nothing to do */
66+
if (NULL == jdata->personality) {
67+
return ORTE_SUCCESS;
68+
}
69+
6670
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
6771
if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) {
6872
if (NULL != mod->module->setup_fork) {
@@ -81,6 +85,11 @@ int orte_schizo_base_setup_child(orte_job_t *jdata,
8185
int rc;
8286
orte_schizo_base_active_module_t *mod;
8387

88+
/* if no personality was specified, then nothing to do */
89+
if (NULL == jdata->personality) {
90+
return ORTE_SUCCESS;
91+
}
92+
8493
OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
8594
if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) {
8695
if (NULL != mod->module->setup_child) {

orte/mca/state/base/state_base_fns.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
3-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -521,13 +521,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
521521
/* update the proc state */
522522
ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
523523
pdata->state = state;
524-
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
524+
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
525525
/* Clean up the session directory as if we were the process
526526
* itself. This covers the case where the process died abnormally
527527
* and didn't cleanup its own session directory.
528528
*/
529529
orte_session_dir_finalize(proc);
530-
}
530+
}
531531
/* if we are trying to terminate and our routes are
532532
* gone, then terminate ourselves IF no local procs
533533
* remain (might be some from another job)
@@ -550,11 +550,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
550550
}
551551
/* return the allocated slot for reuse */
552552
cleanup_node(pdata);
553-
/* track job status */
554-
jdata->num_terminated++;
555-
if (jdata->num_terminated == jdata->num_procs) {
553+
/* track job status */
554+
jdata->num_terminated++;
555+
if (jdata->num_terminated == jdata->num_procs) {
556556
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
557-
}
557+
}
558558
}
559559

560560
cleanup:
@@ -752,10 +752,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
752752
* is maintained!
753753
*/
754754
if (1 < j) {
755-
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
756-
/* this was a debugger daemon. notify that a debugger has detached */
757-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
758-
}
755+
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
756+
/* this was a debugger daemon. notify that a debugger has detached */
757+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
758+
}
759759
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
760760
OBJ_RELEASE(jdata);
761761
}

orte/runtime/orte_mca_params.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
1414
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
1515
* All rights reserved
16-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
16+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
1919
* $COPYRIGHT$
@@ -276,7 +276,7 @@ int orte_register_params(void)
276276
"Test debugger colaunch after debugger attachment",
277277
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
278278
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
279-
&orte_debugger_test_daemon);
279+
&orte_debugger_test_attach);
280280

281281
orte_debugger_check_rate = 0;
282282
(void) mca_base_var_register ("orte", "orte", NULL, "debugger_check_rate",

orte/test/mpi/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll
1+
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach
22

33
all: $(PROGS)
44

orte/test/mpi/attach.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/* -*- C -*-
2+
*
3+
* $HEADER$
4+
*
5+
* Write a one-byte command to the debugger FIFO named on the command line
6+
*/
7+
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
#include <sys/types.h>
11+
#include <sys/stat.h>
12+
#include <fcntl.h>
13+
#include <unistd.h>
14+
15+
/**
 * Send a one-byte command to a debugger FIFO.
 *
 * Opens the file named by argv[1] write-only, writes a single byte with
 * value 1 to it, and closes it again.
 *
 * @param argc  argument count; must be at least 2
 * @param argv  argv[1] is the full path to the debugger FIFO file
 *
 * @return 0 on success, 1 if the FIFO could not be opened or written.
 *         A missing path argument prints a usage message and exits
 *         with status 1.
 */
int main(int argc, char* argv[])
{
    unsigned char fifo_cmd = 1;
    int fd;

    /* argv[0] is the program name, so the FIFO path is only present
     * when argc is at least 2 */
    if (2 > argc) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (0 > fd) {
        perror("attach: open");
        return 1;
    }

    /* a single byte is either written completely or not at all */
    if ((ssize_t)sizeof(fifo_cmd) != write(fd, &fifo_cmd, sizeof(fifo_cmd))) {
        perror("attach: write");
        close(fd);
        return 1;
    }

    close(fd);

    return 0;
}

0 commit comments

Comments
 (0)