Skip to content

Fix debugger attach and cospawn of debugger daemons for the STAT debugger #2425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 2, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ orte/mca/sstore/orte_sstore.7

orte/test/mpi/abort
orte/test/mpi/accept
orte/test/mpi/attach
orte/test/mpi/bad_exit
orte/test/mpi/bcast_loop
orte/test/mpi/concurrent_spawn
Expand Down
10 changes: 9 additions & 1 deletion ompi/mca/rte/orte/rte_orte_module.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2012-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
Expand Down Expand Up @@ -104,6 +104,8 @@ void ompi_rte_wait_for_debugger(void)
{
int debugger;
orte_rml_recv_cb_t xfer;
char *evar;
int time;

/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
Expand All @@ -123,6 +125,12 @@ void ompi_rte_wait_for_debugger(void)
*/
ompi_debugger_setup_dlls();

if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
time = strtol(evar, NULL, 10);
sleep(time);
return;
}

if (orte_standalone_operation) {
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {
Expand Down
14 changes: 10 additions & 4 deletions orte/mca/oob/base/oob_base_stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/*
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -117,9 +117,16 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
* this is a local proc we just haven't heard from
* yet due to a race condition. Check that situation */
if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
ORTE_OOB_SEND(msg);
return;
++msg->retries;
if (msg->retries < orte_rml_base.max_retries) {
ORTE_OOB_SEND(msg);
return;
}
}
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s CANNOT SEND TO %s: TAG %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msg->dst), msg->tag);
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
ORTE_RML_SEND_COMPLETE(msg);
return;
Expand Down Expand Up @@ -396,4 +403,3 @@ static void process_uri(char *uri)
}
opal_argv_free(uris);
}

9 changes: 5 additions & 4 deletions orte/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Copyright (c) 2009 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
Expand Down Expand Up @@ -757,9 +757,10 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
}

cleanup:
/* need to init_after_spawn for debuggers */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);

/* if this wasn't a debugger job, then need to init_after_spawn for debuggers */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
}
OBJ_RELEASE(caddy);
}

Expand Down
4 changes: 3 additions & 1 deletion orte/mca/rml/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -84,6 +84,7 @@ ORTE_DECLSPEC void orte_rml_base_comm_stop(void);
typedef struct {
opal_list_t posted_recvs;
opal_list_t unmatched_msgs;
int max_retries;
#if OPAL_ENABLE_TIMING
bool timing;
#endif
Expand Down Expand Up @@ -123,6 +124,7 @@ typedef struct {
orte_process_name_t origin;
int status; // returned status on send
orte_rml_tag_t tag; // targeted tag
int retries; // #times we have tried to send it

/* user's send callback functions and data */
union {
Expand Down
10 changes: 9 additions & 1 deletion orte/mca/rml/base/rml_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
&orte_rml_base_wrapper);
(void) mca_base_var_register_synonym(var_id, "orte", "rml",NULL,"wrapper", 0);

orte_rml_base.max_retries = 3;
mca_base_var_register("orte", "rml", "base", "max_retries",
"Max #times to retry sending a message",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_rml_base.max_retries);

#if OPAL_ENABLE_TIMING
orte_rml_base.timing = false;
(void) mca_base_var_register ("orte", "rml", "base", "timing",
Expand Down Expand Up @@ -259,6 +267,7 @@ void orte_rml_recv_callback(int status, orte_process_name_t* sender,
/*** RML CLASS INSTANCES ***/
static void send_cons(orte_rml_send_t *ptr)
{
ptr->retries = 0;
ptr->cbdata = NULL;
ptr->iov = NULL;
ptr->buffer = NULL;
Expand Down Expand Up @@ -325,4 +334,3 @@ static void prq_des(orte_rml_recv_request_t *ptr)
OBJ_CLASS_INSTANCE(orte_rml_recv_request_t,
opal_object_t,
prq_cons, prq_des);

6 changes: 5 additions & 1 deletion orte/mca/routed/radix/routed_radix.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -372,6 +372,10 @@ static orte_process_name_t get_route(orte_process_name_t *target)
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
/* find out what daemon hosts this proc */
if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
opal_output_verbose(2, orte_routed_base_framework.framework_output,
"%s ATTEMPTING TO SEND TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(target));
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
ret = ORTE_NAME_INVALID;
goto found;
Expand Down
13 changes: 11 additions & 2 deletions orte/mca/schizo/base/schizo_base_stubs.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -22,7 +22,6 @@ int orte_schizo_base_parse_cli(char *personality,
orte_schizo_base_active_module_t *mod;

if (NULL == personality) {
opal_output(0, "NULL PERSONALITY");
return ORTE_ERR_NOT_SUPPORTED;
}

Expand Down Expand Up @@ -63,6 +62,11 @@ int orte_schizo_base_setup_fork(orte_job_t *jdata,
int rc;
orte_schizo_base_active_module_t *mod;

/* if no personality was specified, then nothing to do */
if (NULL == jdata->personality) {
return ORTE_SUCCESS;
}

OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) {
if (NULL != mod->module->setup_fork) {
Expand All @@ -81,6 +85,11 @@ int orte_schizo_base_setup_child(orte_job_t *jdata,
int rc;
orte_schizo_base_active_module_t *mod;

/* if no personality was specified, then nothing to do */
if (NULL == jdata->personality) {
return ORTE_SUCCESS;
}

OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) {
if (0 == strcmp(jdata->personality, mod->component->mca_component_name)) {
if (NULL != mod->module->setup_child) {
Expand Down
22 changes: 11 additions & 11 deletions orte/mca/state/base/state_base_fns.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -521,13 +521,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
/* update the proc state */
ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
pdata->state = state;
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
/* Clean up the session directory as if we were the process
* itself. This covers the case where the process died abnormally
* and didn't cleanup its own session directory.
*/
orte_session_dir_finalize(proc);
}
}
/* if we are trying to terminate and our routes are
* gone, then terminate ourselves IF no local procs
* remain (might be some from another job)
Expand All @@ -550,11 +550,11 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
}
/* return the allocated slot for reuse */
cleanup_node(pdata);
/* track job status */
jdata->num_terminated++;
if (jdata->num_terminated == jdata->num_procs) {
/* track job status */
jdata->num_terminated++;
if (jdata->num_terminated == jdata->num_procs) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
}
}
}

cleanup:
Expand Down Expand Up @@ -752,10 +752,10 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
* is maintained!
*/
if (1 < j) {
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
/* this was a debugger daemon. notify that a debugger has detached */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
}
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
/* this was a debugger daemon. notify that a debugger has detached */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
}
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
OBJ_RELEASE(jdata);
}
Expand Down
4 changes: 2 additions & 2 deletions orte/runtime/orte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -276,7 +276,7 @@ int orte_register_params(void)
"Test debugger colaunch after debugger attachment",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_debugger_test_daemon);
&orte_debugger_test_attach);

orte_debugger_check_rate = 0;
(void) mca_base_var_register ("orte", "orte", NULL, "debugger_check_rate",
Expand Down
2 changes: 1 addition & 1 deletion orte/test/mpi/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach

all: $(PROGS)

Expand Down
30 changes: 30 additions & 0 deletions orte/test/mpi/attach.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/* -*- C -*-
*
* $HEADER$
*
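* Test helper (not an MPI program): writes a one-byte command to the
* debugger FIFO whose path is given as the first command-line argument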
*/

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/**
 * Write a single command byte to a debugger FIFO.
 *
 * Opens the file named by the first command-line argument for writing
 * and writes one byte with value 1 to it.
 *
 * @param argc  number of command-line arguments; must be at least 2
 * @param argv  argv[1] is the full path of the FIFO to write to
 * @return 0 when the byte was written and the descriptor closed cleanly,
 *         1 on a usage error or when open/write/close fails
 */
int main(int argc, char* argv[])
{
    const unsigned char fifo_cmd = 1;
    ssize_t nwritten;
    int fd;

    /* argv[1] is used below, so the program name alone (argc == 1)
     * is not enough */
    if (argc < 2) {
        fprintf(stderr, "usage: attach <full-path-to-debugger-fifo-file>\n");
        exit(1);
    }

    fd = open(argv[1], O_WRONLY);
    if (fd < 0) {
        perror("attach: open");
        return 1;
    }

    nwritten = write(fd, &fifo_cmd, sizeof(fifo_cmd));
    if (nwritten != (ssize_t)sizeof(fifo_cmd)) {
        if (nwritten < 0) {
            perror("attach: write");
        } else {
            fprintf(stderr, "attach: short write to %s\n", argv[1]);
        }
        close(fd);
        return 1;
    }

    if (close(fd) < 0) {
        perror("attach: close");
        return 1;
    }

    return 0;
}
Loading