1
1
Repair rsh/ssh tree spawn by unpacking and updating the nidmap in remote_spawn.

Add more specific error messages so the cause of a messaging problem is a little clearer. Remove some stale code. Ensure we stop trying to send a message after a few attempts.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Ralph Castain 2017-01-27 11:35:00 -08:00
родитель f4a86904c4
Коммит d672fad849
12 изменённых файлов: 117 добавлений и 110 удалений

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved. # Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and the daemon itself. We cannot recover from this failure, and
therefore will terminate the job. therefore will terminate the job.
#
[no-path]
ORTE does not know how to route a message to the specified daemon
located on the indicated node:
my node: %s
target node: %s
This is usually an internal programming error that should be
reported to the developers. In the meantime, a workaround may
be to set the MCA param routed=direct on the command line or
in your environment. We apologize for the problem.
#
[no-connect]
ORTE is unable to establish a communication connection to the
specified daemon located on the indicated node:
my node: %s
target node: %s
This is usually due to a lack of common network interfaces and/or
no route found between them. Please check network connectivity (including
firewalls and network routing requirements). If these look okay,
then it could be an internal programming error that should be
reported to the developers. In the meantime, a workaround may
be to set the MCA param routed=direct on the command line or
in your environment.

Просмотреть файл

@ -9,7 +9,7 @@
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. * Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
} }
break; break;
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:hnp: no message path to proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
orte_show_help("help-errmgr-base.txt", "no-path", true,
orte_process_info.nodename, pptr->node->name);
/* if this proc is one of my daemons, then we are truly
* hosed - so just exit out
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
break;
}
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* abnormal termination - abort, but only do it once
* to avoid creating a lot of confusion */
default_hnp_abort(jdata);
}
break;
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:hnp: cannot connect to proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
orte_show_help("help-errmgr-base.txt", "no-connect", true,
orte_process_info.nodename, pptr->node->name);
/* if this proc is one of my daemons, then we are truly
* hosed - so just exit out
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
break;
}
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* abnormal termination - abort, but only do it once
* to avoid creating a lot of confusion */
default_hnp_abort(jdata);
}
break;
default: default:
/* shouldn't get this, but terminate job if required */ /* shouldn't get this, but terminate job if required */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,

Просмотреть файл

@ -2,7 +2,7 @@
/* /*
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
OBJ_RELEASE(cd); OBJ_RELEASE(cd);
opal_output_verbose(5, orte_oob_base_framework.framework_output, opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:base:send to target %s", "%s oob:base:send to target %s - %u attempt",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msg->dst)); ORTE_NAME_PRINT(&msg->dst), msg->retries);
/* don't try forever - if we have exceeded the number of retries,
* then report this message as undeliverable even if someone continues
* to think they could reach it */
if (orte_rml_base.max_retries <= msg->retries) {
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
ORTE_RML_SEND_COMPLETE(msg);
return;
}
/* check if we have this peer in our hash table */ /* check if we have this peer in our hash table */
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t)); memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));

Просмотреть файл

@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
const struct sockaddr *addr); const struct sockaddr *addr);
static void ping(const orte_process_name_t *proc); static void ping(const orte_process_name_t *proc);
static void send_nb(orte_rml_send_t *msg); static void send_nb(orte_rml_send_t *msg);
static void resend(struct mca_oob_tcp_msg_error_t *mop);
static void ft_event(int state); static void ft_event(int state);
mca_oob_tcp_module_t mca_oob_tcp_module = { mca_oob_tcp_module_t mca_oob_tcp_module = {
.accept_connection = accept_connection, .accept_connection = accept_connection,
.ping = ping, .ping = ping,
.send_nb = send_nb, .send_nb = send_nb,
.resend = resend,
.ft_event = ft_event .ft_event = ft_event
}; };
@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
} }
} }
/*
 * Retry delivery of a message that previously hit a TCP send error.
 *
 * mpi carries the message to resend (mpi->snd) and the next hop it
 * should go to (mpi->hop).  The function looks up the peer for that
 * hop and then:
 *   - peer unknown: raises a TCP message error event handled by
 *     mca_oob_tcp_component_hop_unknown and returns;
 *   - peer connected: queues the message on the peer (third macro
 *     argument true) and returns;
 *   - peer neither CONNECTING nor in CONNECT_ACK: queues the message
 *     (third macro argument false), marks the peer CONNECTING and
 *     schedules mca_oob_tcp_peer_try_connect.
 *
 * NOTE(review): when the peer is already CONNECTING or in CONNECT_ACK
 * neither branch runs, so this function does not queue mp->snd at all.
 * Confirm whether something else picks the message up in that case.
 */
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
{
/* the parameter is declared with the struct tag; view it through the typedef */
mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
mca_oob_tcp_peer_t *peer;
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:tcp processing resend to peer %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&mp->hop));
/* do we know this peer? */
if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
/* push this back to the component so it can try
* another module within this transport. If no
* module can be found, the component can push back
* to the framework so another component can try
*/
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s:[%s:%d] peer %s unknown",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
__FILE__, __LINE__,
ORTE_NAME_PRINT(&mp->hop));
ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
return;
}
/* should be impossible, but...has this peer had a progress thread assigned yet? */
if (NULL == peer->ev_base) {
/* nope - assign one */
ORTE_OOB_TCP_NEXT_BASE(peer);
}
/* add the msg to this peer's send queue */
if (MCA_OOB_TCP_CONNECTED == peer->state) {
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s tcp:resend: already connected to %s - queueing for send",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->name));
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
return;
}
/* only start a new connection attempt if one is not already in flight */
if (MCA_OOB_TCP_CONNECTING != peer->state &&
MCA_OOB_TCP_CONNECT_ACK != peer->state) {
/* add the message to the queue for sending after the
* connection is formed
*/
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);
/* we have to initiate the connection - again, we do not
* want to block while the connection is created.
* So throw us into an event that will create
* the connection via a mini-state-machine :-)
*/
/* NOTE(review): the log tag below says "send_nb" although this is resend */
opal_output_verbose(2, orte_oob_base_framework.framework_output,
"%s tcp:send_nb: initiating connection to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer->name));
peer->state = MCA_OOB_TCP_CONNECTING;
ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
}
}
/* /*
* Event callback when there is data available on the registered * Event callback when there is data available on the registered
* socket to recv. This is called for the listen sockets to accept an * socket to recv. This is called for the listen sockets to accept an

Просмотреть файл

@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
const struct sockaddr *addr); const struct sockaddr *addr);
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc); typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg); typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state); typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
typedef struct { typedef struct {
mca_oob_tcp_module_accept_connection_fn_t accept_connection; mca_oob_tcp_module_accept_connection_fn_t accept_connection;
mca_oob_tcp_module_ping_fn_t ping; mca_oob_tcp_module_ping_fn_t ping;
mca_oob_tcp_module_send_nb_fn_t send_nb; mca_oob_tcp_module_send_nb_fn_t send_nb;
mca_oob_tcp_module_resend_nb_fn_t resend;
mca_oob_tcp_module_ft_event_fn_t ft_event; mca_oob_tcp_module_ft_event_fn_t ft_event;
} mca_oob_tcp_module_t; } mca_oob_tcp_module_t;
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module; ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;

Просмотреть файл

@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
/* report the error back to the OOB and let it try other components /* report the error back to the OOB and let it try other components
* or declare a problem * or declare a problem
*/ */
if (!orte_finalizing && !orte_abnormal_term_ordered) { mop->rmsg->retries++;
/* if this was a lifeline, then alert */ /* activate the OOB send state */
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) { ORTE_OOB_SEND(mop->rmsg);
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
}
}
OBJ_RELEASE(mop); OBJ_RELEASE(mop);
} }
@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
*/ */
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr); MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
snd = OBJ_NEW(orte_rml_send_t); snd = OBJ_NEW(orte_rml_send_t);
snd->retries = mop->rmsg->retries + 1;
snd->dst = mop->snd->hdr.dst; snd->dst = mop->snd->hdr.dst;
snd->origin = mop->snd->hdr.origin; snd->origin = mop->snd->hdr.origin;
snd->tag = mop->snd->hdr.tag; snd->tag = mop->snd->hdr.tag;
@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pop->peer)); ORTE_NAME_PRINT(&pop->peer));
/* if this was a lifeline, then alert */ ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
}
OBJ_RELEASE(pop); OBJ_RELEASE(pop);
} }

Просмотреть файл

@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
} while(0); } while(0);
/*
 * Schedule a resend of the message described by mop.
 *
 * Allocates a fresh mca_oob_tcp_msg_error_t, copies the message (snd)
 * and next hop (hop) from mop into it, and activates a write event on
 * the event base of the message's peer so that cbfunc is invoked with
 * the new object as its argument.
 *
 *   mop     pointer to the mca_oob_tcp_msg_error_t describing the
 *           failed send; it is evaluated more than once, so pass an
 *           expression without side effects
 *   cbfunc  event callback to run
 *
 * The event base is taken from (mop)->snd->peer.  The previous text
 * referenced a bare `op`, which is neither a parameter nor a local of
 * this macro and so only compiled if the caller happened to have a
 * variable of that name in scope.
 *
 * The trailing semicolon after while(0) is kept so existing call
 * sites expand exactly as before.
 */
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc)                      \
    do {                                                                \
        mca_oob_tcp_msg_error_t *mp;                                    \
        opal_output_verbose(5, orte_oob_base_framework.framework_output, \
                            "%s:[%s:%d] post resend to %s",             \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),         \
                            __FILE__, __LINE__,                         \
                            ORTE_NAME_PRINT(&((mop)->hop)));            \
        mp = OBJ_NEW(mca_oob_tcp_msg_error_t);                          \
        mp->snd = (mop)->snd;                                           \
        mp->hop = (mop)->hop;                                           \
        opal_event_set((mop)->snd->peer->ev_base, &mp->ev, -1,          \
                       OPAL_EV_WRITE, (cbfunc), mp);                    \
        opal_event_set_priority(&mp->ev, ORTE_MSG_PRI);                 \
        opal_event_active(&mp->ev, OPAL_EV_WRITE, 1);                   \
    } while(0);
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \ #define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
do { \ do { \
mca_oob_tcp_msg_error_t *mop; \ mca_oob_tcp_msg_error_t *mop; \
@ -320,8 +303,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
mop->rmsg = (r); \ mop->rmsg = (r); \
mop->hop.jobid = (h)->jobid; \ mop->hop.jobid = (h)->jobid; \
mop->hop.vpid = (h)->vpid; \ mop->hop.vpid = (h)->vpid; \
/* this goes to the OOB framework, so use that event base */ \ /* this goes to the component, so use the framework \
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \ * event base */ \
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
OPAL_EV_WRITE, (c), mop); \ OPAL_EV_WRITE, (c), mop); \
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \ opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -81,7 +81,10 @@ typedef uint32_t orte_proc_state_t;
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */ #define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */ #define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */ #define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* unable to send a message */ #define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* connection to lifeline lost */
#define ORTE_PROC_STATE_NO_PATH_TO_TARGET (ORTE_PROC_STATE_ERROR + 16) /* no path for communicating to target peer */
#define ORTE_PROC_STATE_FAILED_TO_CONNECT (ORTE_PROC_STATE_ERROR + 17) /* unable to connect to target peer */
#define ORTE_PROC_STATE_PEER_UNKNOWN (ORTE_PROC_STATE_ERROR + 18) /* unknown peer */
/* Define a boundary so that external developers /* Define a boundary so that external developers
* have a starting point for defining their own * have a starting point for defining their own

Просмотреть файл

@ -806,6 +806,12 @@ static int remote_spawn(opal_buffer_t *launch)
OBJ_CONSTRUCT(&coll, opal_list_t); OBJ_CONSTRUCT(&coll, opal_list_t);
orte_routed.get_routing_list(rtmod, &coll); orte_routed.get_routing_list(rtmod, &coll);
/* extract and update the daemon map */
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* if I have no children, just return */ /* if I have no children, just return */
if (0 == opal_list_get_size(&coll)) { if (0 == opal_list_get_size(&coll)) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science * Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -231,7 +231,13 @@ void orte_rml_send_callback(int status, orte_process_name_t *peer,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), tag, ORTE_NAME_PRINT(peer), tag,
ORTE_ERROR_NAME(status)); ORTE_ERROR_NAME(status));
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG); if (ORTE_ERR_NO_PATH_TO_TARGET == status) {
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_NO_PATH_TO_TARGET);
} else if (ORTE_ERR_ADDRESSEE_UNKNOWN == status) {
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_PEER_UNKNOWN);
} else {
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
}
} }
} }

Просмотреть файл

@ -1,2 +1 @@
anandhis anandhis
rhc

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -434,6 +434,12 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
return "UNABLE TO SEND MSG"; return "UNABLE TO SEND MSG";
case ORTE_PROC_STATE_LIFELINE_LOST: case ORTE_PROC_STATE_LIFELINE_LOST:
return "LIFELINE LOST"; return "LIFELINE LOST";
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
return "NO PATH TO TARGET";
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
return "FAILED TO CONNECT";
case ORTE_PROC_STATE_PEER_UNKNOWN:
return "PEER UNKNOWN";
case ORTE_PROC_STATE_ANY: case ORTE_PROC_STATE_ANY:
return "ANY"; return "ANY";
default: default: