1
1

Detect when the system contains a mix of big-endian (BE) and little-endian (LE) nodes, warn that OMPI does not currently support such an environment, and error out.

Fixes #2817

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-07-03 15:47:05 -07:00
родитель d1c5955b73
Коммит 2753f53e6d
4 изменённых файлов: 58 добавлений и 9 удалений

Просмотреть файл

@ -588,7 +588,7 @@ AC_CACHE_SAVE
opal_show_title "Header file tests" opal_show_title "Header file tests"
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \ AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \ dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \ libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
poll.h pthread.h pty.h pwd.h sched.h \ poll.h pthread.h pty.h pwd.h sched.h \
strings.h stropts.h linux/ethtool.h linux/sockios.h \ strings.h stropts.h linux/ethtool.h linux/sockios.h \

Просмотреть файл

@ -32,6 +32,9 @@
#ifdef HAVE_UNISTD_H #ifdef HAVE_UNISTD_H
#include <unistd.h> #include <unistd.h>
#endif #endif
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "opal/constants.h" #include "opal/constants.h"
@ -2155,7 +2158,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo) char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
{ {
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt; int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
char *sig=NULL, *arch=NULL; char *sig=NULL, *arch = NULL, *endian;
hwloc_obj_t obj; hwloc_obj_t obj;
unsigned i; unsigned i;
@ -2175,14 +2178,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
break; break;
} }
} }
if (NULL == arch) { if (NULL == arch) {
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH", arch = "unknown";
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
} else {
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
} }
#ifdef __BYTE_ORDER
#if __BYTE_ORDER == __LITTLE_ENDIAN
endian = "le";
#else
endian = "be";
#endif
#else
endian = "unknown";
#endif
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
return sig; return sig;
} }

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved. # Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
no active out-of-band transports and therefore cannot execute this call. no active out-of-band transports and therefore cannot execute this call.
Please check to see if you have the "oob" MCA parameter set and ensure Please check to see if you have the "oob" MCA parameter set and ensure
that it is either unset or at least includes the tcp transport. that it is either unset or at least includes the tcp transport.
#
[multi-endian]
Open MPI does not currently support multi-endian operations. We have
detected that the following node differs in endianness:
Nodename: %s
Endian: %s
Local endian: %s
Please correct the situation and try again.

Просмотреть файл

@ -1058,12 +1058,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
orte_daemon_cmd_flag_t cmd; orte_daemon_cmd_flag_t cmd;
int32_t flag; int32_t flag;
opal_value_t *kv; opal_value_t *kv;
char *myendian;
/* get the daemon job, if necessary */ /* get the daemon job, if necessary */
if (NULL == jdatorted) { if (NULL == jdatorted) {
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
} }
/* get my endianness */
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
if (NULL == t) {
/* should never happen */
myendian = "unknown";
} else {
myendian = strrchr(t->sig, ':');
++myendian;
}
/* multiple daemons could be in this buffer, so unpack until we exhaust the data */ /* multiple daemons could be in this buffer, so unpack until we exhaust the data */
idx = 1; idx = 1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) { while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
@ -1263,8 +1274,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
} }
free(sig); free(sig);
break; break;
} else {
/* check if the difference is due to the endianness */
ptr = strrchr(sig, ':');
++ptr;
if (0 != strcmp(ptr, myendian)) {
/* we don't currently handle multi-endian operations in the
* MPI support */
orte_show_help("help-plm-base", "multi-endian", true,
nodename, ptr, myendian);
orted_failed_launch = true;
if (NULL != topo) {
hwloc_topology_destroy(topo);
}
goto CLEANUP;
}
} }
} }
if (!found) { if (!found) {
/* nope - save the signature and request the complete topology from that node */ /* nope - save the signature and request the complete topology from that node */
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,