Коммит
410befd255
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
|
||||
connection to the node, or possibly an internal failure of
|
||||
the daemon itself. We cannot recover from this failure, and
|
||||
therefore will terminate the job.
|
||||
#
|
||||
[no-path]
|
||||
ORTE does not know how to route a message to the specified daemon
|
||||
located on the indicated node:
|
||||
|
||||
my node: %s
|
||||
target node: %s
|
||||
|
||||
This is usually an internal programming error that should be
|
||||
reported to the developers. In the meantime, a workaround may
|
||||
be to set the MCA param routed=direct on the command line or
|
||||
in your environment. We apologize for the problem.
|
||||
#
|
||||
[no-connect]
|
||||
ORTE is unable to establish a communication connection to the
|
||||
specified daemon located on the indicated node:
|
||||
|
||||
my node: %s
|
||||
target node: %s
|
||||
|
||||
This is usually due to a lack of common network interfaces and/or
|
||||
no route found between them. Please check network connectivity (including
|
||||
firewalls and network routing requirements). If these look okay,
|
||||
then it could be an internal programming error that should be
|
||||
reported to the developers. In the meantime, a workaround may
|
||||
be to set the MCA param routed=direct on the command line or
|
||||
in your environment.
|
||||
|
@ -9,7 +9,7 @@
|
||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:hnp: no message path to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
orte_show_help("help-errmgr-base.txt", "no-path", true,
|
||||
orte_process_info.nodename, pptr->node->name);
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:hnp: cannot connect to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
orte_show_help("help-errmgr-base.txt", "no-connect", true,
|
||||
orte_process_info.nodename, pptr->node->name);
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* abnormal termination - abort, but only do it once
|
||||
* to avoid creating a lot of confusion */
|
||||
default_hnp_abort(jdata);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/* shouldn't get this, but terminate job if required */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
|
@ -497,12 +497,6 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree if static ports were enabled - still
|
||||
* need to do it anyway just to initialize things
|
||||
*/
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
|
||||
/* if we are using static ports, then we need to setup
|
||||
* the daemon info so the RML can function properly
|
||||
* without requiring a wireup stage. This must be done
|
||||
@ -519,6 +513,12 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
error = "construct daemon map from static ports";
|
||||
goto error;
|
||||
}
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree if static ports were enabled
|
||||
*/
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
/* routing can be enabled */
|
||||
orte_routed_base.routing_enabled = true;
|
||||
}
|
||||
|
||||
/* Now provide a chance for the PLM
|
||||
|
@ -27,7 +27,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rml/base/base.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/util/compress.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -386,8 +386,14 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
||||
goto relay;
|
||||
}
|
||||
|
||||
/* update the routing plan */
|
||||
orte_routed.update_routing_plan(rtmod);
|
||||
if (!ORTE_PROC_IS_HNP) {
|
||||
/* update the routing plan - the HNP already did
|
||||
* it when it computed the VM, so don't waste time
|
||||
* re-doing it here */
|
||||
orte_routed.update_routing_plan(rtmod);
|
||||
}
|
||||
/* routing is now possible */
|
||||
orte_routed_base.routing_enabled = true;
|
||||
|
||||
/* see if we have wiring info as well */
|
||||
cnt=1;
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
||||
OBJ_RELEASE(cd);
|
||||
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s oob:base:send to target %s",
|
||||
"%s oob:base:send to target %s - %u attempt",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&msg->dst));
|
||||
ORTE_NAME_PRINT(&msg->dst), msg->retries);
|
||||
|
||||
/* don't try forever - if we have exceeded the number of retries,
|
||||
* then report this message as undeliverable even if someone continues
|
||||
* to think they could reach it */
|
||||
if (orte_rml_base.max_retries <= msg->retries) {
|
||||
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
|
||||
ORTE_RML_SEND_COMPLETE(msg);
|
||||
return;
|
||||
}
|
||||
|
||||
/* check if we have this peer in our hash table */
|
||||
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
||||
|
@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
|
||||
const struct sockaddr *addr);
|
||||
static void ping(const orte_process_name_t *proc);
|
||||
static void send_nb(orte_rml_send_t *msg);
|
||||
static void resend(struct mca_oob_tcp_msg_error_t *mop);
|
||||
static void ft_event(int state);
|
||||
|
||||
mca_oob_tcp_module_t mca_oob_tcp_module = {
|
||||
.accept_connection = accept_connection,
|
||||
.ping = ping,
|
||||
.send_nb = send_nb,
|
||||
.resend = resend,
|
||||
.ft_event = ft_event
|
||||
};
|
||||
|
||||
@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
|
||||
}
|
||||
}
|
||||
|
||||
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
|
||||
{
|
||||
mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
|
||||
mca_oob_tcp_peer_t *peer;
|
||||
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s:tcp processing resend to peer %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&mp->hop));
|
||||
|
||||
/* do we know this peer? */
|
||||
if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
|
||||
/* push this back to the component so it can try
|
||||
* another module within this transport. If no
|
||||
* module can be found, the component can push back
|
||||
* to the framework so another component can try
|
||||
*/
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s:[%s:%d] peer %s unknown",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
__FILE__, __LINE__,
|
||||
ORTE_NAME_PRINT(&mp->hop));
|
||||
ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
|
||||
return;
|
||||
}
|
||||
|
||||
/* should be impossible, but...has this peer had a progress thread assigned yet? */
|
||||
if (NULL == peer->ev_base) {
|
||||
/* nope - assign one */
|
||||
ORTE_OOB_TCP_NEXT_BASE(peer);
|
||||
}
|
||||
|
||||
/* add the msg to this peer's send queue */
|
||||
if (MCA_OOB_TCP_CONNECTED == peer->state) {
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:resend: already connected to %s - queueing for send",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer->name));
|
||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (MCA_OOB_TCP_CONNECTING != peer->state &&
|
||||
MCA_OOB_TCP_CONNECT_ACK != peer->state) {
|
||||
/* add the message to the queue for sending after the
|
||||
* connection is formed
|
||||
*/
|
||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);
|
||||
/* we have to initiate the connection - again, we do not
|
||||
* want to block while the connection is created.
|
||||
* So throw us into an event that will create
|
||||
* the connection via a mini-state-machine :-)
|
||||
*/
|
||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
||||
"%s tcp:send_nb: initiating connection to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer->name));
|
||||
peer->state = MCA_OOB_TCP_CONNECTING;
|
||||
ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Event callback when there is data available on the registered
|
||||
* socket to recv. This is called for the listen sockets to accept an
|
||||
|
@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
|
||||
const struct sockaddr *addr);
|
||||
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
|
||||
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
|
||||
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
|
||||
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
|
||||
|
||||
typedef struct {
|
||||
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
|
||||
mca_oob_tcp_module_ping_fn_t ping;
|
||||
mca_oob_tcp_module_send_nb_fn_t send_nb;
|
||||
mca_oob_tcp_module_resend_nb_fn_t resend;
|
||||
mca_oob_tcp_module_ft_event_fn_t ft_event;
|
||||
} mca_oob_tcp_module_t;
|
||||
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
|
||||
|
@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
||||
/* report the error back to the OOB and let it try other components
|
||||
* or declare a problem
|
||||
*/
|
||||
if (!orte_finalizing && !orte_abnormal_term_ordered) {
|
||||
/* if this was a lifeline, then alert */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
|
||||
} else {
|
||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
|
||||
}
|
||||
}
|
||||
mop->rmsg->retries++;
|
||||
/* activate the OOB send state */
|
||||
ORTE_OOB_SEND(mop->rmsg);
|
||||
|
||||
OBJ_RELEASE(mop);
|
||||
}
|
||||
@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
|
||||
*/
|
||||
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
|
||||
snd = OBJ_NEW(orte_rml_send_t);
|
||||
snd->retries = mop->rmsg->retries + 1;
|
||||
snd->dst = mop->snd->hdr.dst;
|
||||
snd->origin = mop->snd->hdr.origin;
|
||||
snd->tag = mop->snd->hdr.tag;
|
||||
@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pop->peer));
|
||||
|
||||
/* if this was a lifeline, then alert */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
|
||||
} else {
|
||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
|
||||
}
|
||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
|
||||
OBJ_RELEASE(pop);
|
||||
}
|
||||
|
||||
|
@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc) \
|
||||
do { \
|
||||
mca_oob_tcp_msg_error_t *mp; \
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output, \
|
||||
"%s:[%s:%d] post resend to %s", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, \
|
||||
ORTE_NAME_PRINT(&((mop)->hop))); \
|
||||
mp = OBJ_NEW(mca_oob_tcp_msg_error_t); \
|
||||
mp->snd = (mop)->snd; \
|
||||
mp->hop = (mop)->hop; \
|
||||
opal_event_set(op->snd->peer->ev_base, &mp->ev, -1, \
|
||||
OPAL_EV_WRITE, (cbfunc), mp); \
|
||||
opal_event_set_priority(&mp->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&mp->ev, OPAL_EV_WRITE, 1); \
|
||||
} while(0);
|
||||
|
||||
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
||||
do { \
|
||||
mca_oob_tcp_msg_error_t *mop; \
|
||||
@ -320,8 +303,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
||||
mop->rmsg = (r); \
|
||||
mop->hop.jobid = (h)->jobid; \
|
||||
mop->hop.vpid = (h)->vpid; \
|
||||
/* this goes to the OOB framework, so use that event base */ \
|
||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||
/* this goes to the component, so use the framework \
|
||||
* event base */ \
|
||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||
OPAL_EV_WRITE, (c), mop); \
|
||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||
|
@ -410,15 +410,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
orte_process_info.num_procs = jdatorted->num_procs;
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* ensure all routing plans are up-to-date */
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
|
||||
/* If this job is being started by me, then there is nothing
|
||||
* further we need to do as any user directives (e.g., to tie
|
||||
* off IO to /dev/null) will have been included in the launch
|
||||
@ -2158,7 +2149,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* ensure all routing plans are up-to-date */
|
||||
/* ensure all routing plans are up-to-date - we need this
|
||||
* so we know how to tree-spawn and/or xcast info */
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -81,7 +81,10 @@ typedef uint32_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
|
||||
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
|
||||
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
|
||||
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* unable to send a message */
|
||||
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* connection to lifeline lost */
|
||||
#define ORTE_PROC_STATE_NO_PATH_TO_TARGET (ORTE_PROC_STATE_ERROR + 16) /* no path for communicating to target peer */
|
||||
#define ORTE_PROC_STATE_FAILED_TO_CONNECT (ORTE_PROC_STATE_ERROR + 17) /* unable to connect to target peer */
|
||||
#define ORTE_PROC_STATE_PEER_UNKNOWN (ORTE_PROC_STATE_ERROR + 18) /* unknown peer */
|
||||
|
||||
/* Define a boundary so that external developers
|
||||
* have a starting point for defining their own
|
||||
|
@ -801,6 +801,15 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* extract and update the daemon map */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* since we are tree-spawning, we need to update the routing plan */
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
|
||||
/* get the updated routing list */
|
||||
rtmod = orte_rml.get_routed(orte_coll_conduit);
|
||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||
|
@ -5,7 +5,7 @@
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -231,7 +231,13 @@ void orte_rml_send_callback(int status, orte_process_name_t *peer,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(peer), tag,
|
||||
ORTE_ERROR_NAME(status));
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
||||
if (ORTE_ERR_NO_PATH_TO_TARGET == status) {
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_NO_PATH_TO_TARGET);
|
||||
} else if (ORTE_ERR_ADDRESSEE_UNKNOWN == status) {
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_PEER_UNKNOWN);
|
||||
} else {
|
||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,2 +1 @@
|
||||
anandhis
|
||||
rhc
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2007-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -41,6 +41,7 @@ OBJ_CLASS_DECLARATION(orte_routed_base_active_t);
|
||||
|
||||
typedef struct {
|
||||
opal_list_t actives;
|
||||
bool routing_enabled;
|
||||
} orte_routed_base_t;
|
||||
ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base;
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -110,7 +110,7 @@ orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t
|
||||
orte_routed_base_active_t *active;
|
||||
|
||||
/* a NULL module corresponds to direct */
|
||||
if (NULL == module) {
|
||||
if (!orte_routed_base.routing_enabled || NULL == module) {
|
||||
return *target;
|
||||
}
|
||||
|
||||
@ -178,6 +178,7 @@ void orte_routed_base_update_routing_plan(char *module)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -58,6 +58,8 @@ static int orte_routed_base_open(mca_base_open_flag_t flags)
|
||||
{
|
||||
/* setup our list of actives */
|
||||
OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t);
|
||||
/* start with routing DISABLED */
|
||||
orte_routed_base.routing_enabled = false;
|
||||
|
||||
/* Open up all available components */
|
||||
return mca_base_framework_components_open(&orte_routed_base_framework, flags);
|
||||
|
@ -49,7 +49,7 @@ static int orte_routed_debruijn_component_query(mca_base_module_t **module, int
|
||||
* systems, we will allow other options that have even fewer hops to
|
||||
* support wireup
|
||||
*/
|
||||
*priority = 70;
|
||||
*priority = 10;
|
||||
*module = (mca_base_module_t *) &orte_routed_debruijn_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -538,4 +538,3 @@ static int radix_ft_event(int state)
|
||||
return exit_status;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -69,7 +69,7 @@ static int orte_routed_radix_component_query(mca_base_module_t **module, int *pr
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
*priority = 50;
|
||||
*priority = 70;
|
||||
*module = (mca_base_module_t *) &orte_routed_radix_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -317,7 +317,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
/*
|
||||
* Send the request to termiante
|
||||
* Send the request to terminate
|
||||
*/
|
||||
if( num_new_procs > 0 ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
|
@ -344,6 +344,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init
|
||||
* we continue to have a reference count on them. So we have to finalize them twice...
|
||||
*/
|
||||
@ -647,7 +648,6 @@ int orte_daemon(int argc, char *argv[])
|
||||
/* If I have a parent, then save his contact info so
|
||||
* any messages we send can flow thru him.
|
||||
*/
|
||||
|
||||
orte_parent_uri = NULL;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
|
||||
"URI for the parent if tree launch is enabled.",
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -434,6 +434,12 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
return "UNABLE TO SEND MSG";
|
||||
case ORTE_PROC_STATE_LIFELINE_LOST:
|
||||
return "LIFELINE LOST";
|
||||
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||
return "NO PATH TO TARGET";
|
||||
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||
return "FAILED TO CONNECT";
|
||||
case ORTE_PROC_STATE_PEER_UNKNOWN:
|
||||
return "PEER UNKNOWN";
|
||||
case ORTE_PROC_STATE_ANY:
|
||||
return "ANY";
|
||||
default:
|
||||
|
@ -62,6 +62,7 @@
|
||||
#include "orte/mca/dfs/dfs.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -686,7 +687,11 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||
free(dids);
|
||||
|
||||
/* update num procs */
|
||||
orte_process_info.num_procs = daemons->num_procs;
|
||||
if (orte_process_info.num_procs != daemons->num_procs) {
|
||||
orte_process_info.num_procs = daemons->num_procs;
|
||||
/* need to update the routing plan */
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
}
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user