1
1

oob/tcp: add show_help message about version mismatch

Be more explicit about version mismatch between ORTE processes.

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
Jeff Squyres 2017-09-12 07:54:48 -07:00
родитель 40afd525f8
Коммит 0f8077ace6
3 изменённых файлов: 36 добавлений и 8 удалений

Просмотреть файл

@ -24,6 +24,9 @@
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

Просмотреть файл

@ -11,7 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -106,10 +106,29 @@ levels.
Remote host: %s
Remote port: %d
The connection was rejected.
#
[static-fwd]
Static ports were requested while orte_fwd_mpirun_port was set.
Both options cannot be simultaneously set. Please either set
orte_fwd_mpirun_port=false or remove any static port directives.
#
[version mismatch]
Open MPI detected a mismatch in versions between two processes. This
typically means that you executed "mpirun" (or "mpiexec") from one
version of Open MPI on one node, but your default path on one of the
other nodes upon which you launched found a different version of Open
MPI.
Open MPI only supports running exactly the same version between all
processes in a single job.
This will almost certainly cause unpredictable behavior, and may end
up aborting your job.
Local host: %s
Local process name: %s
Local Open MPI version: %s
Peer host: %s
Peer process name: %s
Peer Open MPI version: %s

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2018 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
@ -58,6 +58,7 @@
#include "opal/util/net.h"
#include "opal/util/fd.h"
#include "opal/util/error.h"
#include "opal/util/show_help.h"
#include "opal/class/opal_hash_table.h"
#include "opal/mca/event/event.h"
@ -701,6 +702,7 @@ static bool retry(mca_oob_tcp_peer_t* peer, int sd, bool fatal)
}
}
int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
int sd, mca_oob_tcp_hdr_t *dhdr)
{
@ -890,11 +892,15 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
version = (char*)((char*)msg + offset);
offset += strlen(version) + 1;
if (0 != strcmp(version, orte_version_string)) {
opal_output(0, "%s tcp_peer_recv_connect_ack: "
"received different version from %s: %s instead of %s\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->name)),
version, orte_version_string);
opal_show_help("help-oob-tcp.txt", "version mismatch",
true,
opal_process_info.nodename,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_version_string,
opal_fd_get_peer_name(peer->sd),
ORTE_NAME_PRINT(&(peer->name)),
version);
peer->state = MCA_OOB_TCP_FAILED;
mca_oob_tcp_peer_close(peer);
free(msg);