Repair rsh/ssh tree spawn
Repair rsh/ssh tree spawn by unpacking and updating the nidmap in remote_spawn. Add more specific error messages so the cause of a messaging problem is clearer. Remove some stale code. Ensure we stop trying to send a message once the maximum number of retries has been reached. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
parent
f4a86904c4
Commit
d672fad849
@ -10,7 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
|
|||||||
connection to the node, or possibly an internal failure of
|
connection to the node, or possibly an internal failure of
|
||||||
the daemon itself. We cannot recover from this failure, and
|
the daemon itself. We cannot recover from this failure, and
|
||||||
therefore will terminate the job.
|
therefore will terminate the job.
|
||||||
|
#
|
||||||
|
[no-path]
|
||||||
|
ORTE does not know how to route a message to the specified daemon
|
||||||
|
located on the indicated node:
|
||||||
|
|
||||||
|
my node: %s
|
||||||
|
target node: %s
|
||||||
|
|
||||||
|
This is usually an internal programming error that should be
|
||||||
|
reported to the developers. In the meantime, a workaround may
|
||||||
|
be to set the MCA param routed=direct on the command line or
|
||||||
|
in your environment. We apologize for the problem.
|
||||||
|
#
|
||||||
|
[no-connect]
|
||||||
|
ORTE is unable to establish a communication connection to the
|
||||||
|
specified daemon located on the indicated node:
|
||||||
|
|
||||||
|
my node: %s
|
||||||
|
target node: %s
|
||||||
|
|
||||||
|
This is usually due to a lack of common network interfaces and/or
|
||||||
|
no route found between them. Please check network connectivity (including
|
||||||
|
firewalls and network routing requirements). If these look okay,
|
||||||
|
then it could be an internal programming error that should be
|
||||||
|
reported to the developers. In the meantime, a workaround may
|
||||||
|
be to set the MCA param routed=direct on the command line or
|
||||||
|
in your environment.
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||||
|
"%s errmgr:hnp: no message path to proc %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(proc)));
|
||||||
|
orte_show_help("help-errmgr-base.txt", "no-path", true,
|
||||||
|
orte_process_info.nodename, pptr->node->name);
|
||||||
|
/* if this proc is one of my daemons, then we are truly
|
||||||
|
* hosed - so just exit out
|
||||||
|
*/
|
||||||
|
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||||
|
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||||
|
/* abnormal termination - abort, but only do it once
|
||||||
|
* to avoid creating a lot of confusion */
|
||||||
|
default_hnp_abort(jdata);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||||
|
"%s errmgr:hnp: cannot connect to proc %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(proc)));
|
||||||
|
orte_show_help("help-errmgr-base.txt", "no-connect", true,
|
||||||
|
orte_process_info.nodename, pptr->node->name);
|
||||||
|
/* if this proc is one of my daemons, then we are truly
|
||||||
|
* hosed - so just exit out
|
||||||
|
*/
|
||||||
|
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||||
|
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||||
|
/* abnormal termination - abort, but only do it once
|
||||||
|
* to avoid creating a lot of confusion */
|
||||||
|
default_hnp_abort(jdata);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* shouldn't get this, but terminate job if required */
|
/* shouldn't get this, but terminate job if required */
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
|||||||
OBJ_RELEASE(cd);
|
OBJ_RELEASE(cd);
|
||||||
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||||
"%s oob:base:send to target %s",
|
"%s oob:base:send to target %s - %u attempt",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&msg->dst));
|
ORTE_NAME_PRINT(&msg->dst), msg->retries);
|
||||||
|
|
||||||
|
/* don't try forever - if we have exceeded the number of retries,
|
||||||
|
* then report this message as undeliverable even if someone continues
|
||||||
|
* to think they could reach it */
|
||||||
|
if (orte_rml_base.max_retries <= msg->retries) {
|
||||||
|
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
|
||||||
|
ORTE_RML_SEND_COMPLETE(msg);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/* check if we have this peer in our hash table */
|
/* check if we have this peer in our hash table */
|
||||||
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
||||||
|
@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
|
|||||||
const struct sockaddr *addr);
|
const struct sockaddr *addr);
|
||||||
static void ping(const orte_process_name_t *proc);
|
static void ping(const orte_process_name_t *proc);
|
||||||
static void send_nb(orte_rml_send_t *msg);
|
static void send_nb(orte_rml_send_t *msg);
|
||||||
static void resend(struct mca_oob_tcp_msg_error_t *mop);
|
|
||||||
static void ft_event(int state);
|
static void ft_event(int state);
|
||||||
|
|
||||||
mca_oob_tcp_module_t mca_oob_tcp_module = {
|
mca_oob_tcp_module_t mca_oob_tcp_module = {
|
||||||
.accept_connection = accept_connection,
|
.accept_connection = accept_connection,
|
||||||
.ping = ping,
|
.ping = ping,
|
||||||
.send_nb = send_nb,
|
.send_nb = send_nb,
|
||||||
.resend = resend,
|
|
||||||
.ft_event = ft_event
|
.ft_event = ft_event
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
|
|
||||||
{
|
|
||||||
mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
|
|
||||||
mca_oob_tcp_peer_t *peer;
|
|
||||||
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s:tcp processing resend to peer %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&mp->hop));
|
|
||||||
|
|
||||||
/* do we know this peer? */
|
|
||||||
if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
|
|
||||||
/* push this back to the component so it can try
|
|
||||||
* another module within this transport. If no
|
|
||||||
* module can be found, the component can push back
|
|
||||||
* to the framework so another component can try
|
|
||||||
*/
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s:[%s:%d] peer %s unknown",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
__FILE__, __LINE__,
|
|
||||||
ORTE_NAME_PRINT(&mp->hop));
|
|
||||||
ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* should be impossible, but...has this peer had a progress thread assigned yet? */
|
|
||||||
if (NULL == peer->ev_base) {
|
|
||||||
/* nope - assign one */
|
|
||||||
ORTE_OOB_TCP_NEXT_BASE(peer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* add the msg to this peer's send queue */
|
|
||||||
if (MCA_OOB_TCP_CONNECTED == peer->state) {
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s tcp:resend: already connected to %s - queueing for send",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&peer->name));
|
|
||||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (MCA_OOB_TCP_CONNECTING != peer->state &&
|
|
||||||
MCA_OOB_TCP_CONNECT_ACK != peer->state) {
|
|
||||||
/* add the message to the queue for sending after the
|
|
||||||
* connection is formed
|
|
||||||
*/
|
|
||||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);
|
|
||||||
/* we have to initiate the connection - again, we do not
|
|
||||||
* want to block while the connection is created.
|
|
||||||
* So throw us into an event that will create
|
|
||||||
* the connection via a mini-state-machine :-)
|
|
||||||
*/
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s tcp:send_nb: initiating connection to %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&peer->name));
|
|
||||||
peer->state = MCA_OOB_TCP_CONNECTING;
|
|
||||||
ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Event callback when there is data available on the registered
|
* Event callback when there is data available on the registered
|
||||||
* socket to recv. This is called for the listen sockets to accept an
|
* socket to recv. This is called for the listen sockets to accept an
|
||||||
|
@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
|
|||||||
const struct sockaddr *addr);
|
const struct sockaddr *addr);
|
||||||
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
|
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
|
||||||
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
|
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
|
||||||
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
|
|
||||||
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
|
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
|
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
|
||||||
mca_oob_tcp_module_ping_fn_t ping;
|
mca_oob_tcp_module_ping_fn_t ping;
|
||||||
mca_oob_tcp_module_send_nb_fn_t send_nb;
|
mca_oob_tcp_module_send_nb_fn_t send_nb;
|
||||||
mca_oob_tcp_module_resend_nb_fn_t resend;
|
|
||||||
mca_oob_tcp_module_ft_event_fn_t ft_event;
|
mca_oob_tcp_module_ft_event_fn_t ft_event;
|
||||||
} mca_oob_tcp_module_t;
|
} mca_oob_tcp_module_t;
|
||||||
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
|
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
|
||||||
|
@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
|||||||
/* report the error back to the OOB and let it try other components
|
/* report the error back to the OOB and let it try other components
|
||||||
* or declare a problem
|
* or declare a problem
|
||||||
*/
|
*/
|
||||||
if (!orte_finalizing && !orte_abnormal_term_ordered) {
|
mop->rmsg->retries++;
|
||||||
/* if this was a lifeline, then alert */
|
/* activate the OOB send state */
|
||||||
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) {
|
ORTE_OOB_SEND(mop->rmsg);
|
||||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
|
|
||||||
} else {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
OBJ_RELEASE(mop);
|
OBJ_RELEASE(mop);
|
||||||
}
|
}
|
||||||
@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
|
|||||||
*/
|
*/
|
||||||
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
|
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
|
||||||
snd = OBJ_NEW(orte_rml_send_t);
|
snd = OBJ_NEW(orte_rml_send_t);
|
||||||
|
snd->retries = mop->rmsg->retries + 1;
|
||||||
snd->dst = mop->snd->hdr.dst;
|
snd->dst = mop->snd->hdr.dst;
|
||||||
snd->origin = mop->snd->hdr.origin;
|
snd->origin = mop->snd->hdr.origin;
|
||||||
snd->tag = mop->snd->hdr.tag;
|
snd->tag = mop->snd->hdr.tag;
|
||||||
@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&pop->peer));
|
ORTE_NAME_PRINT(&pop->peer));
|
||||||
|
|
||||||
/* if this was a lifeline, then alert */
|
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
|
||||||
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
|
|
||||||
} else {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(pop);
|
OBJ_RELEASE(pop);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
|||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc) \
|
|
||||||
do { \
|
|
||||||
mca_oob_tcp_msg_error_t *mp; \
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output, \
|
|
||||||
"%s:[%s:%d] post resend to %s", \
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
|
||||||
__FILE__, __LINE__, \
|
|
||||||
ORTE_NAME_PRINT(&((mop)->hop))); \
|
|
||||||
mp = OBJ_NEW(mca_oob_tcp_msg_error_t); \
|
|
||||||
mp->snd = (mop)->snd; \
|
|
||||||
mp->hop = (mop)->hop; \
|
|
||||||
opal_event_set(op->snd->peer->ev_base, &mp->ev, -1, \
|
|
||||||
OPAL_EV_WRITE, (cbfunc), mp); \
|
|
||||||
opal_event_set_priority(&mp->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&mp->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
|
||||||
|
|
||||||
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
||||||
do { \
|
do { \
|
||||||
mca_oob_tcp_msg_error_t *mop; \
|
mca_oob_tcp_msg_error_t *mop; \
|
||||||
@ -320,8 +303,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
|||||||
mop->rmsg = (r); \
|
mop->rmsg = (r); \
|
||||||
mop->hop.jobid = (h)->jobid; \
|
mop->hop.jobid = (h)->jobid; \
|
||||||
mop->hop.vpid = (h)->vpid; \
|
mop->hop.vpid = (h)->vpid; \
|
||||||
/* this goes to the OOB framework, so use that event base */ \
|
/* this goes to the component, so use the framework \
|
||||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
* event base */ \
|
||||||
|
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||||
OPAL_EV_WRITE, (c), mop); \
|
OPAL_EV_WRITE, (c), mop); \
|
||||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -81,7 +81,10 @@ typedef uint32_t orte_proc_state_t;
|
|||||||
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
|
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
|
||||||
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
|
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
|
||||||
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
|
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
|
||||||
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* unable to send a message */
|
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* connection to lifeline lost */
|
||||||
|
#define ORTE_PROC_STATE_NO_PATH_TO_TARGET (ORTE_PROC_STATE_ERROR + 16) /* no path for communicating to target peer */
|
||||||
|
#define ORTE_PROC_STATE_FAILED_TO_CONNECT (ORTE_PROC_STATE_ERROR + 17) /* unable to connect to target peer */
|
||||||
|
#define ORTE_PROC_STATE_PEER_UNKNOWN (ORTE_PROC_STATE_ERROR + 18) /* unknown peer */
|
||||||
|
|
||||||
/* Define a boundary so that external developers
|
/* Define a boundary so that external developers
|
||||||
* have a starting point for defining their own
|
* have a starting point for defining their own
|
||||||
|
@ -806,6 +806,12 @@ static int remote_spawn(opal_buffer_t *launch)
|
|||||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||||
orte_routed.get_routing_list(rtmod, &coll);
|
orte_routed.get_routing_list(rtmod, &coll);
|
||||||
|
|
||||||
|
/* extract and update the daemon map */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
/* if I have no children, just return */
|
/* if I have no children, just return */
|
||||||
if (0 == opal_list_get_size(&coll)) {
|
if (0 == opal_list_get_size(&coll)) {
|
||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -231,7 +231,13 @@ void orte_rml_send_callback(int status, orte_process_name_t *peer,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(peer), tag,
|
ORTE_NAME_PRINT(peer), tag,
|
||||||
ORTE_ERROR_NAME(status));
|
ORTE_ERROR_NAME(status));
|
||||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
if (ORTE_ERR_NO_PATH_TO_TARGET == status) {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_NO_PATH_TO_TARGET);
|
||||||
|
} else if (ORTE_ERR_ADDRESSEE_UNKNOWN == status) {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_PEER_UNKNOWN);
|
||||||
|
} else {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,2 +1 @@
|
|||||||
anandhis
|
anandhis
|
||||||
rhc
|
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -434,6 +434,12 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
|||||||
return "UNABLE TO SEND MSG";
|
return "UNABLE TO SEND MSG";
|
||||||
case ORTE_PROC_STATE_LIFELINE_LOST:
|
case ORTE_PROC_STATE_LIFELINE_LOST:
|
||||||
return "LIFELINE LOST";
|
return "LIFELINE LOST";
|
||||||
|
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||||
|
return "NO PATH TO TARGET";
|
||||||
|
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||||
|
return "FAILED TO CONNECT";
|
||||||
|
case ORTE_PROC_STATE_PEER_UNKNOWN:
|
||||||
|
return "PEER UNKNOWN";
|
||||||
case ORTE_PROC_STATE_ANY:
|
case ORTE_PROC_STATE_ANY:
|
||||||
return "ANY";
|
return "ANY";
|
||||||
default:
|
default:
|
||||||
|
Loading…
x
Reference in new issue
Block a user