Detect a mix of big-endian (BE) and little-endian (LE) nodes in the system, warn that OMPI does not currently support this environment, and error out.

Fixes #2817

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
d1c5955b73
Коммит
2753f53e6d
@ -588,7 +588,7 @@ AC_CACHE_SAVE
|
||||
opal_show_title "Header file tests"
|
||||
|
||||
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
|
||||
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \
|
||||
dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
|
||||
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
|
||||
poll.h pthread.h pty.h pwd.h sched.h \
|
||||
strings.h stropts.h linux/ethtool.h linux/sockios.h \
|
||||
|
@ -32,6 +32,9 @@
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_ENDIAN_H
|
||||
#include <endian.h>
|
||||
#endif
|
||||
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/constants.h"
|
||||
@ -2155,7 +2158,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
|
||||
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
|
||||
{
|
||||
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
|
||||
char *sig=NULL, *arch=NULL;
|
||||
char *sig=NULL, *arch = NULL, *endian;
|
||||
hwloc_obj_t obj;
|
||||
unsigned i;
|
||||
|
||||
@ -2175,14 +2178,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL == arch) {
|
||||
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH",
|
||||
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
|
||||
} else {
|
||||
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
|
||||
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
|
||||
arch = "unknown";
|
||||
}
|
||||
|
||||
#ifdef __BYTE_ORDER
|
||||
#if __BYTE_ORDER == __LITTLE_ENDIAN
|
||||
endian = "le";
|
||||
#else
|
||||
endian = "be";
|
||||
#endif
|
||||
#else
|
||||
endian = "unknown";
|
||||
#endif
|
||||
|
||||
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
|
||||
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
|
||||
return sig;
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
|
||||
no active out-of-band transports and therefore cannot execute this call.
|
||||
Please check to see if you have the "oob" MCA parameter set and ensure
|
||||
that it is either unset or at least includes the tcp transport.
|
||||
#
|
||||
[multi-endian]
|
||||
Open MPI does not currently support multi-endian operations. We have
|
||||
detected that the following node differs in endianness:
|
||||
|
||||
|
||||
Nodename: %s
|
||||
Endian: %s
|
||||
Local endian: %s
|
||||
|
||||
Please correct the situation and try again.
|
||||
|
@ -1058,12 +1058,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
||||
orte_daemon_cmd_flag_t cmd;
|
||||
int32_t flag;
|
||||
opal_value_t *kv;
|
||||
char *myendian;
|
||||
|
||||
/* get the daemon job, if necessary */
|
||||
if (NULL == jdatorted) {
|
||||
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
|
||||
}
|
||||
|
||||
/* get my endianness */
|
||||
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
|
||||
if (NULL == t) {
|
||||
/* should never happen */
|
||||
myendian = "unknown";
|
||||
} else {
|
||||
myendian = strrchr(t->sig, ':');
|
||||
++myendian;
|
||||
}
|
||||
|
||||
/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
|
||||
idx = 1;
|
||||
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
|
||||
@ -1263,8 +1274,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
||||
}
|
||||
free(sig);
|
||||
break;
|
||||
} else {
|
||||
/* check if the difference is due to the endianness */
|
||||
ptr = strrchr(sig, ':');
|
||||
++ptr;
|
||||
if (0 != strcmp(ptr, myendian)) {
|
||||
/* we don't currently handle multi-endian operations in the
|
||||
* MPI support */
|
||||
orte_show_help("help-plm-base", "multi-endian", true,
|
||||
nodename, ptr, myendian);
|
||||
orted_failed_launch = true;
|
||||
if (NULL != topo) {
|
||||
hwloc_topology_destroy(topo);
|
||||
}
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
/* nope - save the signature and request the complete topology from that node */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user