Skip to content

Commit 2a580fa

Browse files
author
Ralph Castain
authored
Merge pull request #3801 from rhc54/topic/hetero
Detect that we have a mix of BE/LE in the system
2 parents 753e3b0 + 2753f53 commit 2a580fa

File tree

4 files changed

+58
-9
lines changed

4 files changed

+58
-9
lines changed

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -588,7 +588,7 @@ AC_CACHE_SAVE
588588
opal_show_title "Header file tests"
589589

590590
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
591-
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \
591+
dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
592592
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
593593
poll.h pthread.h pty.h pwd.h sched.h \
594594
strings.h stropts.h linux/ethtool.h linux/sockios.h \

opal/mca/hwloc/base/hwloc_base_util.c

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,9 @@
3232
#ifdef HAVE_UNISTD_H
3333
#include <unistd.h>
3434
#endif
35+
#ifdef HAVE_ENDIAN_H
36+
#include <endian.h>
37+
#endif
3538

3639
#include "opal/runtime/opal.h"
3740
#include "opal/constants.h"
@@ -2155,7 +2158,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
21552158
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
21562159
{
21572160
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
2158-
char *sig=NULL, *arch=NULL;
2161+
char *sig=NULL, *arch = NULL, *endian;
21592162
hwloc_obj_t obj;
21602163
unsigned i;
21612164

@@ -2175,14 +2178,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
21752178
break;
21762179
}
21772180
}
2178-
21792181
if (NULL == arch) {
2180-
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH",
2181-
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
2182-
} else {
2183-
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
2184-
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
2182+
arch = "unknown";
21852183
}
2184+
2185+
#ifdef __BYTE_ORDER
2186+
#if __BYTE_ORDER == __LITTLE_ENDIAN
2187+
endian = "le";
2188+
#else
2189+
endian = "be";
2190+
#endif
2191+
#else
2192+
endian = "unknown";
2193+
#endif
2194+
2195+
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
2196+
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
21862197
return sig;
21872198
}
21882199

orte/mca/plm/base/help-plm-base.txt

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2015 Intel, Inc. All rights reserved.
13+
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
162162
no active out-of-band transports and therefore cannot execute this call.
163163
Please check to see if you have the "oob" MCA parameter set and ensure
164164
that it is either unset or at least includes the tcp transport.
165+
#
166+
[multi-endian]
167+
Open MPI does not currently support multi-endian operations. We have
168+
detected that the following node differs in endianness:
169+
170+
171+
Nodename: %s
172+
Endian: %s
173+
Local endian: %s
174+
175+
Please correct the situation and try again.

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1058,12 +1058,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10581058
orte_daemon_cmd_flag_t cmd;
10591059
int32_t flag;
10601060
opal_value_t *kv;
1061+
char *myendian;
10611062

10621063
/* get the daemon job, if necessary */
10631064
if (NULL == jdatorted) {
10641065
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
10651066
}
10661067

1068+
/* get my endianness */
1069+
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
1070+
if (NULL == t) {
1071+
/* should never happen */
1072+
myendian = "unknown";
1073+
} else {
1074+
myendian = strrchr(t->sig, ':');
1075+
++myendian;
1076+
}
1077+
10671078
/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
10681079
idx = 1;
10691080
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
@@ -1263,8 +1274,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
12631274
}
12641275
free(sig);
12651276
break;
1277+
} else {
1278+
/* check if the difference is due to the endianness */
1279+
ptr = strrchr(sig, ':');
1280+
++ptr;
1281+
if (0 != strcmp(ptr, myendian)) {
1282+
/* we don't currently handle multi-endian operations in the
1283+
* MPI support */
1284+
orte_show_help("help-plm-base", "multi-endian", true,
1285+
nodename, ptr, myendian);
1286+
orted_failed_launch = true;
1287+
if (NULL != topo) {
1288+
hwloc_topology_destroy(topo);
1289+
}
1290+
goto CLEANUP;
1291+
}
12661292
}
12671293
}
1294+
12681295
if (!found) {
12691296
/* nope - save the signature and request the complete topology from that node */
12701297
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,

0 commit comments

Comments
 (0)