Repair rsh/ssh tree spawn
Repair rsh/ssh tree spawn by unpacking and updating the nidmap in remote_spawn. Add more specific error messages so the cause of a messaging problem is a little clearer. Remove some stale code. Ensure we stop trying to send a message after a few attempts. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
f4a86904c4
Коммит
d672fad849
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
|
||||
connection to the node, or possibly an internal failure of
|
||||
the daemon itself. We cannot recover from this failure, and
|
||||
therefore will terminate the job.
|
||||
#
|
||||
[no-path]
|
||||
ORTE does not know how to route a message to the specified daemon
|
||||
located on the indicated node:
|
||||
|
||||
my node: %s
|
||||
target node: %s
|
||||
|
||||
This is usually an internal programming error that should be
|
||||
reported to the developers. In the meantime, a workaround may
|
||||
be to set the MCA param routed=direct on the command line or
|
||||
in your environment. We apologize for the problem.
|
||||
#
|
||||
[no-connect]
|
||||
ORTE is unable to establish a communication connection to the
|
||||
specified daemon located on the indicated node:
|
||||
|
||||
my node: %s
|
||||
target node: %s
|
||||
|
||||
This is usually due to a lack of common network interfaces and/or
|
||||
the absence of a route between them. Please check network connectivity (including
|
||||
firewalls and network routing requirements). If these look okay,
|
||||
then it could be an internal programming error that should be
|
||||
reported to the developers. In the meantime, a workaround may
|
||||
be to set the MCA param routed=direct on the command line or
|
||||
in your environment.
|
||||
|
@ -9,7 +9,7 @@
|
||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:hnp: no message path to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
orte_show_help("help-errmgr-base.txt", "no-path", true,
|
||||
orte_process_info.nodename, pptr->node->name);
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:hnp: cannot connect to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
orte_show_help("help-errmgr-base.txt", "no-connect", true,
|
||||
orte_process_info.nodename, pptr->node->name);
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* shouldn't get this, but terminate job if required */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(cd);
|
||||
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s oob:base:send to target %s",
|
||||
"%s oob:base:send to target %s - %u attempt",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&msg->dst));
|
||||
ORTE_NAME_PRINT(&msg->dst), msg->retries);
|
||||
|
||||
/* don't try forever - if we have exceeded the number of retries,
|
||||
* then report this message as undeliverable even if someone continues
|
||||
* to think they could reach it */
|
||||
if (orte_rml_base.max_retries <= msg->retries) {
|
||||
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
|
||||
ORTE_RML_SEND_COMPLETE(msg);
|
||||
return;
|
||||
}
|
||||
|
||||
/* check if we have this peer in our hash table */
|
||||
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
||||
|
@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
|
||||
const struct sockaddr *addr);
|
||||
static void ping(const orte_process_name_t *proc);
|
||||
static void send_nb(orte_rml_send_t *msg);
|
||||
static void resend(struct mca_oob_tcp_msg_error_t *mop);
|
||||
static void ft_event(int state);
|
||||
|
||||
mca_oob_tcp_module_t mca_oob_tcp_module = {
|
||||
.accept_connection = accept_connection,
|
||||
.ping = ping,
|
||||
.send_nb = send_nb,
|
||||
.resend = resend,
|
||||
.ft_event = ft_event
|
||||
};
|
||||
|
||||
@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
|
||||
}
|
||||
}
|
||||
|
||||
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
|
||||
{
|
||||
mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
|
||||
mca_oob_tcp_peer_t *peer;
|
||||
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp processing resend to peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&mp->hop));
|
||||
|
||||
/* do we know this peer? */
|
||||
if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
|
||||
/* push this back to the component so it can try
|
||||
* another module within this transport. If no
|
||||
* module can be found, the component can push back
|
||||
* to the framework so another component can try
|
||||
*/
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s:[%s:%d] peer %s unknown",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
__FILE__, __LINE__,
|
||||
ORTE_NAME_PRINT(&mp->hop));
|
||||
ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
|
||||
return;
|
||||
}
|
||||
|
||||
/* should be impossible, but...has this peer had a progress thread assigned yet? */
|
||||
if (NULL == peer->ev_base) {
|
||||
/* nope - assign one */
|
||||
ORTE_OOB_TCP_NEXT_BASE(peer);
|
||||
}
|
||||
|
||||
/* add the msg to this peer's send queue */
|
||||
if (MCA_OOB_TCP_CONNECTED == peer->state) {
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:resend: already connected to %s - queueing for send",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer->name));
|
||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (MCA_OOB_TCP_CONNECTING != peer->state &&
|
||||
MCA_OOB_TCP_CONNECT_ACK != peer->state) {
|
||||
/* add the message to the queue for sending after the
|
||||
* connection is formed
|
||||
*/
|
||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);
|
||||
/* we have to initiate the connection - again, we do not
|
||||
* want to block while the connection is created.
|
||||
* So throw us into an event that will create
|
||||
* the connection via a mini-state-machine :-)
|
||||
*/
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:send_nb: initiating connection to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer->name));
|
||||
peer->state = MCA_OOB_TCP_CONNECTING;
|
||||
ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Event callback when there is data available on the registered
|
||||
* socket to recv. This is called for the listen sockets to accept an
|
||||
|
@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
|
||||
const struct sockaddr *addr);
|
||||
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
|
||||
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
|
||||
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
|
||||
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
|
||||
|
||||
typedef struct {
|
||||
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
|
||||
mca_oob_tcp_module_ping_fn_t ping;
|
||||
mca_oob_tcp_module_send_nb_fn_t send_nb;
|
||||
mca_oob_tcp_module_resend_nb_fn_t resend;
|
||||
mca_oob_tcp_module_ft_event_fn_t ft_event;
|
||||
} mca_oob_tcp_module_t;
|
||||
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
|
||||
|
@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
||||
/* report the error back to the OOB and let it try other components
|
||||
* or declare a problem
|
||||
*/
|
||||
if (!orte_finalizing && !orte_abnormal_term_ordered) {
|
||||
/* if this was a lifeline, then alert */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
|
||||
} else {
|
||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
|
||||
}
|
||||
}
|
||||
mop->rmsg->retries++;
|
||||
/* activate the OOB send state */
|
||||
ORTE_OOB_SEND(mop->rmsg);
|
||||
|
||||
OBJ_RELEASE(mop);
|
||||
}
|
||||
@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
|
||||
*/
|
||||
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
|
||||
snd = OBJ_NEW(orte_rml_send_t);
|
||||
snd->retries = mop->rmsg->retries + 1;
|
||||
snd->dst = mop->snd->hdr.dst;
|
||||
snd->origin = mop->snd->hdr.origin;
|
||||
snd->tag = mop->snd->hdr.tag;
|
||||
@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pop->peer));
|
||||
|
||||
/* if this was a lifeline, then alert */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
|
||||
} else {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
|
||||
}
|
||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
|
||||
OBJ_RELEASE(pop);
|
||||
}
|
||||
|
||||
|
@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc) \
|
||||
do { \
|
||||
mca_oob_tcp_msg_error_t *mp; \
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output, \
|
||||
"%s:[%s:%d] post resend to %s", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
ORTE_NAME_PRINT(&((mop)->hop))); \
|
||||
mp = OBJ_NEW(mca_oob_tcp_msg_error_t); \
|
||||
mp->snd = (mop)->snd; \
|
||||
mp->hop = (mop)->hop; \
|
||||
opal_event_set(op->snd->peer->ev_base, &mp->ev, -1, \
|
||||
OPAL_EV_WRITE, (cbfunc), mp); \
|
||||
opal_event_set_priority(&mp->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&mp->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
||||
do { \
|
||||
mca_oob_tcp_msg_error_t *mop; \
|
||||
@ -320,7 +303,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
||||
mop->rmsg = (r); \
|
||||
mop->hop.jobid = (h)->jobid; \
|
||||
mop->hop.vpid = (h)->vpid; \
|
||||
/* this goes to the OOB framework, so use that event base */ \
|
||||
/* this goes to the component, so use the framework \
|
||||
* event base */ \
|
||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||
OPAL_EV_WRITE, (c), mop); \
|
||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -81,7 +81,10 @@ typedef uint32_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
|
||||
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
|
||||
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
|
||||
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* unable to send a message */
|
||||
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* connection to lifeline lost */
|
||||
#define ORTE_PROC_STATE_NO_PATH_TO_TARGET (ORTE_PROC_STATE_ERROR + 16) /* no path for communicating to target peer */
|
||||
#define ORTE_PROC_STATE_FAILED_TO_CONNECT (ORTE_PROC_STATE_ERROR + 17) /* unable to connect to target peer */
|
||||
#define ORTE_PROC_STATE_PEER_UNKNOWN (ORTE_PROC_STATE_ERROR + 18) /* unknown peer */
|
||||
|
||||
/* Define a boundary so that external developers
|
||||
* have a starting point for defining their own
|
||||
|
@ -806,6 +806,12 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||
orte_routed.get_routing_list(rtmod, &coll);
|
||||
|
||||
/* extract and update the daemon map */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* if I have no children, just return */
|
||||
if (0 == opal_list_get_size(&coll)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
|
@ -5,7 +5,7 @@
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -231,9 +231,15 @@ void orte_rml_send_callback(int status, orte_process_name_t *peer,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(peer), tag,
|
||||
ORTE_ERROR_NAME(status));
|
||||
if (ORTE_ERR_NO_PATH_TO_TARGET == status) {
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_NO_PATH_TO_TARGET);
|
||||
} else if (ORTE_ERR_ADDRESSEE_UNKNOWN == status) {
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_PEER_UNKNOWN);
|
||||
} else {
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void orte_rml_recv_callback(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
|
@ -1,2 +1 @@
|
||||
anandhis
|
||||
rhc
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -434,6 +434,12 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
return "UNABLE TO SEND MSG";
|
||||
case ORTE_PROC_STATE_LIFELINE_LOST:
|
||||
return "LIFELINE LOST";
|
||||
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||
return "NO PATH TO TARGET";
|
||||
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||
return "FAILED TO CONNECT";
|
||||
case ORTE_PROC_STATE_PEER_UNKNOWN:
|
||||
return "PEER UNKNOWN";
|
||||
case ORTE_PROC_STATE_ANY:
|
||||
return "ANY";
|
||||
default:
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user