diff --git a/configure.ac b/configure.ac index deb5a68031..6161929de3 100644 --- a/configure.ac +++ b/configure.ac @@ -588,7 +588,7 @@ AC_CACHE_SAVE opal_show_title "Header file tests" AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \ - dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \ + dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \ libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \ poll.h pthread.h pty.h pwd.h sched.h \ strings.h stropts.h linux/ethtool.h linux/sockios.h \ diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index 5fe9b90e56..cd75ce6111 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -32,6 +32,9 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_ENDIAN_H +#include +#endif #include "opal/runtime/opal.h" #include "opal/constants.h" @@ -2155,7 +2158,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo) { int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt; - char *sig=NULL, *arch=NULL; + char *sig=NULL, *arch = NULL, *endian; hwloc_obj_t obj; unsigned i; @@ -2175,14 +2178,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo) break; } } - if (NULL == arch) { - asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH", - nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt); - } else { - asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s", - nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch); + arch = "unknown"; } + +#ifdef __BYTE_ORDER +#if __BYTE_ORDER == __LITTLE_ENDIAN + endian = "le"; +#else + endian = "be"; +#endif +#else + endian = "unknown"; +#endif + + asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s", + nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian); return sig; } diff --git a/orte/mca/plm/base/help-plm-base.txt b/orte/mca/plm/base/help-plm-base.txt index 8e13f92b36..bcc0912588 100644 --- a/orte/mca/plm/base/help-plm-base.txt +++ b/orte/mca/plm/base/help-plm-base.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved. +# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has no active out-of-band transports and therefore cannot execute this call. Please check to see if you have the "oob" MCA parameter set and ensure that it is either unset or at least includes the tcp transport. +# +[multi-endian] +Open MPI does not currently support multi-endian operations. We have +detected that the following node differs in endianness: + + + Nodename: %s + Endian: %s + Local endian: %s + +Please correct the situation and try again. diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index a65a2f87ca..8a87ab3183 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1058,12 +1058,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, orte_daemon_cmd_flag_t cmd; int32_t flag; opal_value_t *kv; + char *myendian; /* get the daemon job, if necessary */ if (NULL == jdatorted) { jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } + /* get my endianness */ + t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0); + if (NULL == t) { + /* should never happen */ + myendian = "unknown"; + } else { + myendian = strrchr(t->sig, ':'); + ++myendian; + } + /* multiple daemons could be in this buffer, so unpack until we exhaust the data */ idx = 1; while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) { @@ -1263,8 +1274,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender, } free(sig); break; + } else { + /* check if the difference is due to the endianness */ + ptr = strrchr(sig, ':'); + ++ptr; + if (0 != strcmp(ptr, myendian)) { + /* we don't currently handle multi-endian operations in the + * MPI support */ + orte_show_help("help-plm-base", "multi-endian", true, + nodename, ptr, myendian); + orted_failed_launch = true; + if (NULL != topo) { + hwloc_topology_destroy(topo); + } + goto CLEANUP; + } } } + if (!found) { /* nope - save the signature and request the complete topology from that node */ OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,