Skip to content

Commit 4eeb415

Browse files
committed
odls/alps: resolve hang when launching with mpirun on Crays
This commit removes some code that protected the odls/alps component from closing alps file descriptors. For some unknown reason, leaving these file descriptors open can cause an orted to hang when launching apps. Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from commit 9817216) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent ea4d30b commit 4eeb415

File tree

1 file changed

+26
-49
lines changed

1 file changed

+26
-49
lines changed

orte/mca/odls/alps/odls_alps_module.c

Lines changed: 26 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
34
* University Research and Technology
@@ -13,7 +14,7 @@
1314
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
1415
* Copyright (c) 2008-2017 Cisco Systems, Inc. All rights reserved
1516
* Copyright (c) 2010 IBM Corporation. All rights reserved.
16-
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
17+
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
1718
* reserved.
1819
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1920
* Copyright (c) 2017 Rutgers, The State University of New Jersey.
@@ -109,6 +110,7 @@
109110
#include <dirent.h>
110111
#endif
111112

113+
#include <ctype.h>
112114

113115
#include "opal/mca/hwloc/hwloc-internal.h"
114116
#include "opal/mca/hwloc/base/base.h"
@@ -289,59 +291,30 @@ static void send_error_show_help(int fd, int exit_status,
289291
exit(exit_status);
290292
}
291293

292-
static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opts)
293-
{
294-
int rc, fd;
295-
DIR *dir = NULL;
296-
struct dirent *files;
297-
int app_alps_filedes[2], alps_app_filedes[2];
298-
299-
dir = opendir("/proc/self/fd");
294+
static int close_open_file_descriptors(int write_fd,
295+
orte_iof_base_io_conf_t opts) {
296+
DIR *dir = opendir("/proc/self/fd");
300297
if (NULL == dir) {
301298
return ORTE_ERR_FILE_OPEN_FAILURE;
302299
}
303-
304-
/* close all file descriptors w/ exception of stdin/stdout/stderr,
305-
the pipe used for the IOF INTERNAL messages, and the pipe up to
306-
the parent. Be careful to retain all of the pipe fd's set up
307-
by the apshephered. These are needed for obtaining RDMA credentials,
308-
synchronizing with aprun, etc. */
309-
310-
rc = alps_app_lli_pipes(app_alps_filedes,alps_app_filedes);
311-
if (0 != rc) {
312-
closedir(dir);
313-
return ORTE_ERR_FILE_OPEN_FAILURE;
314-
}
315-
316-
while ((files = readdir(dir)) != NULL) {
317-
if(!strncmp(files->d_name,".",1) || !strncmp(files->d_name,"..",2)) continue;
318-
319-
fd = strtoul(files->d_name, NULL, 10);
320-
if (EINVAL == errno || ERANGE == errno) {
300+
struct dirent *files;
301+
while (NULL != (files = readdir(dir))) {
302+
if (!isdigit(files->d_name[0])) {
303+
continue;
304+
}
305+
int fd = strtol(files->d_name, NULL, 10);
306+
if (errno == EINVAL || errno == ERANGE) {
321307
closedir(dir);
322308
return ORTE_ERR_TYPE_MISMATCH;
323309
}
324-
325-
/*
326-
* skip over the pipes we have open to apshepherd or slurmd
327-
*/
328-
329-
if (fd == XTAPI_FD_IDENTITY) continue;
330-
if (fd == XTAPI_FD_RESILIENCY) continue;
331-
if ((fd == app_alps_filedes[0]) ||
332-
(fd == app_alps_filedes[1]) ||
333-
(fd == alps_app_filedes[0]) ||
334-
(fd == alps_app_filedes[1])) continue;
335-
336310
if (fd >=3 &&
337311
#if OPAL_PMIX_V1
338312
fd != opts.p_internal[1] &&
339313
#endif
340314
fd != write_fd) {
341-
close(fd);
315+
close(fd);
342316
}
343317
}
344-
345318
closedir(dir);
346319
return ORTE_SUCCESS;
347320
}
@@ -368,14 +341,18 @@ static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
368341
always outputs a nice, single message indicating what
369342
happened
370343
*/
371-
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
372-
ORTE_ERROR_LOG(i);
373-
send_error_show_help(write_fd, 1,
374-
"help-orte-odls-alps.txt",
375-
"iof setup failed",
376-
orte_process_info.nodename, cd->app->app);
377-
/* Does not return */
378-
}
344+
345+
if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
346+
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
347+
ORTE_ERROR_LOG(i);
348+
send_error_show_help(write_fd, 1,
349+
"help-orte-odls-alps.txt",
350+
"iof setup failed",
351+
orte_process_info.nodename, cd->app->app);
352+
/* Does not return */
353+
}
354+
}
355+
379356

380357
/* now set any child-level controls such as binding */
381358
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);

0 commit comments

Comments
 (0)