diff --git a/opal/util/fd.c b/opal/util/fd.c index 398f93276e..5dd2c54d5c 100644 --- a/opal/util/fd.c +++ b/opal/util/fd.c @@ -24,6 +24,9 @@ #ifdef HAVE_ARPA_INET_H #include <arpa/inet.h> #endif +#ifdef HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif #ifdef HAVE_UNISTD_H #include <unistd.h> #endif diff --git a/orte/mca/oob/tcp/help-oob-tcp.txt b/orte/mca/oob/tcp/help-oob-tcp.txt index fd9dfdfde8..e5562ac470 100644 --- a/orte/mca/oob/tcp/help-oob-tcp.txt +++ b/orte/mca/oob/tcp/help-oob-tcp.txt @@ -11,7 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2014-2017 Intel, Inc. All rights reserved. -# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -106,10 +106,29 @@ levels. Remote host: %s Remote port: %d - The connection was rejected. # [static-fwd] Static ports were requested while orte_fwd_mpirun_port was set. Both options cannot be simultaneously set. Please either set orte_fwd_mpirun_port=false or remove any static port directives. +# +[version mismatch] +Open MPI detected a mismatch in versions between two processes. This +typically means that you executed "mpirun" (or "mpiexec") from one +version of Open MPI on one node, but your default path on one of the +other nodes upon which you launched found a different version of Open +MPI. + +Open MPI only supports running exactly the same version between all +processes in a single job. + +This will almost certainly cause unpredictable behavior, and may end +up aborting your job.
+ + Local host: %s + Local process name: %s + Local Open MPI version: %s + Peer host: %s + Peer process name: %s + Peer Open MPI version: %s diff --git a/orte/mca/oob/tcp/oob_tcp_connection.c b/orte/mca/oob/tcp/oob_tcp_connection.c index 14f606640f..d2c7d3a8be 100644 --- a/orte/mca/oob/tcp/oob_tcp_connection.c +++ b/orte/mca/oob/tcp/oob_tcp_connection.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2018 Cisco Systems, Inc. All rights reserved * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science @@ -58,6 +58,7 @@ #include "opal/util/net.h" #include "opal/util/fd.h" #include "opal/util/error.h" +#include "opal/util/show_help.h" #include "opal/class/opal_hash_table.h" #include "opal/mca/event/event.h" @@ -701,6 +702,7 @@ static bool retry(mca_oob_tcp_peer_t* peer, int sd, bool fatal) } } + int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr, int sd, mca_oob_tcp_hdr_t *dhdr) { @@ -890,11 +892,15 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr, version = (char*)((char*)msg + offset); offset += strlen(version) + 1; if (0 != strcmp(version, orte_version_string)) { - opal_output(0, "%s tcp_peer_recv_connect_ack: " - "received different version from %s: %s instead of %s\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->name)), - version, orte_version_string); + opal_show_help("help-oob-tcp.txt", "version mismatch", + true, + opal_process_info.nodename, + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_version_string, + opal_fd_get_peer_name(peer->sd), + ORTE_NAME_PRINT(&(peer->name)), + version); + peer->state = MCA_OOB_TCP_FAILED; mca_oob_tcp_peer_close(peer); free(msg);