Коммит
410befd255
@ -10,7 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
|
|||||||
connection to the node, or possibly an internal failure of
|
connection to the node, or possibly an internal failure of
|
||||||
the daemon itself. We cannot recover from this failure, and
|
the daemon itself. We cannot recover from this failure, and
|
||||||
therefore will terminate the job.
|
therefore will terminate the job.
|
||||||
|
#
|
||||||
|
[no-path]
|
||||||
|
ORTE does not know how to route a message to the specified daemon
|
||||||
|
located on the indicated node:
|
||||||
|
|
||||||
|
my node: %s
|
||||||
|
target node: %s
|
||||||
|
|
||||||
|
This is usually an internal programming error that should be
|
||||||
|
reported to the developers. In the meantime, a workaround may
|
||||||
|
be to set the MCA param routed=direct on the command line or
|
||||||
|
in your environment. We apologize for the problem.
|
||||||
|
#
|
||||||
|
[no-connect]
|
||||||
|
ORTE is unable to establish a communication connection to the
|
||||||
|
specified daemon located on the indicated node:
|
||||||
|
|
||||||
|
my node: %s
|
||||||
|
target node: %s
|
||||||
|
|
||||||
|
This is usually due to a lack of common network interfaces and/or
|
||||||
|
no route found between them. Please check network connectivity (including
|
||||||
|
firewalls and network routing requirements). If these look okay,
|
||||||
|
then it could be an internal programming error that should be
|
||||||
|
reported to the developers. In the meantime, a workaround may
|
||||||
|
be to set the MCA param routed=direct on the command line or
|
||||||
|
in your environment.
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||||
|
"%s errmgr:hnp: no message path to proc %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(proc)));
|
||||||
|
orte_show_help("help-errmgr-base.txt", "no-path", true,
|
||||||
|
orte_process_info.nodename, pptr->node->name);
|
||||||
|
/* if this proc is one of my daemons, then we are truly
|
||||||
|
* hosed - so just exit out
|
||||||
|
*/
|
||||||
|
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||||
|
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||||
|
/* abnormal termination - abort, but only do it once
|
||||||
|
* to avoid creating a lot of confusion */
|
||||||
|
default_hnp_abort(jdata);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||||
|
"%s errmgr:hnp: cannot connect to proc %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(proc)));
|
||||||
|
orte_show_help("help-errmgr-base.txt", "no-connect", true,
|
||||||
|
orte_process_info.nodename, pptr->node->name);
|
||||||
|
/* if this proc is one of my daemons, then we are truly
|
||||||
|
* hosed - so just exit out
|
||||||
|
*/
|
||||||
|
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||||
|
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||||
|
/* abnormal termination - abort, but only do it once
|
||||||
|
* to avoid creating a lot of confusion */
|
||||||
|
default_hnp_abort(jdata);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* shouldn't get this, but terminate job if required */
|
/* shouldn't get this, but terminate job if required */
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||||
|
@ -497,12 +497,6 @@ int orte_ess_base_orted_setup(char **hosts)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* be sure to update the routing tree so the initial "phone home"
|
|
||||||
* to mpirun goes through the tree if static ports were enabled - still
|
|
||||||
* need to do it anyway just to initialize things
|
|
||||||
*/
|
|
||||||
orte_routed.update_routing_plan(NULL);
|
|
||||||
|
|
||||||
/* if we are using static ports, then we need to setup
|
/* if we are using static ports, then we need to setup
|
||||||
* the daemon info so the RML can function properly
|
* the daemon info so the RML can function properly
|
||||||
* without requiring a wireup stage. This must be done
|
* without requiring a wireup stage. This must be done
|
||||||
@ -519,6 +513,12 @@ int orte_ess_base_orted_setup(char **hosts)
|
|||||||
error = "construct daemon map from static ports";
|
error = "construct daemon map from static ports";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
/* be sure to update the routing tree so the initial "phone home"
|
||||||
|
* to mpirun goes through the tree if static ports were enabled
|
||||||
|
*/
|
||||||
|
orte_routed.update_routing_plan(NULL);
|
||||||
|
/* routing can be enabled */
|
||||||
|
orte_routed_base.routing_enabled = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Now provide a chance for the PLM
|
/* Now provide a chance for the PLM
|
||||||
|
@ -27,7 +27,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/rml/base/base.h"
|
#include "orte/mca/rml/base/base.h"
|
||||||
#include "orte/mca/rml/base/rml_contact.h"
|
#include "orte/mca/rml/base/rml_contact.h"
|
||||||
#include "orte/mca/routed/routed.h"
|
#include "orte/mca/routed/base/base.h"
|
||||||
#include "orte/mca/state/state.h"
|
#include "orte/mca/state/state.h"
|
||||||
#include "orte/util/compress.h"
|
#include "orte/util/compress.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
@ -386,8 +386,14 @@ static void xcast_recv(int status, orte_process_name_t* sender,
|
|||||||
goto relay;
|
goto relay;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update the routing plan */
|
if (!ORTE_PROC_IS_HNP) {
|
||||||
orte_routed.update_routing_plan(rtmod);
|
/* update the routing plan - the HNP already did
|
||||||
|
* it when it computed the VM, so don't waste time
|
||||||
|
* re-doing it here */
|
||||||
|
orte_routed.update_routing_plan(rtmod);
|
||||||
|
}
|
||||||
|
/* routing is now possible */
|
||||||
|
orte_routed_base.routing_enabled = true;
|
||||||
|
|
||||||
/* see if we have wiring info as well */
|
/* see if we have wiring info as well */
|
||||||
cnt=1;
|
cnt=1;
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
|||||||
OBJ_RELEASE(cd);
|
OBJ_RELEASE(cd);
|
||||||
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||||
"%s oob:base:send to target %s",
|
"%s oob:base:send to target %s - %u attempt",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&msg->dst));
|
ORTE_NAME_PRINT(&msg->dst), msg->retries);
|
||||||
|
|
||||||
|
/* don't try forever - if we have exceeded the number of retries,
|
||||||
|
* then report this message as undeliverable even if someone continues
|
||||||
|
* to think they could reach it */
|
||||||
|
if (orte_rml_base.max_retries <= msg->retries) {
|
||||||
|
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
|
||||||
|
ORTE_RML_SEND_COMPLETE(msg);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/* check if we have this peer in our hash table */
|
/* check if we have this peer in our hash table */
|
||||||
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
||||||
|
@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
|
|||||||
const struct sockaddr *addr);
|
const struct sockaddr *addr);
|
||||||
static void ping(const orte_process_name_t *proc);
|
static void ping(const orte_process_name_t *proc);
|
||||||
static void send_nb(orte_rml_send_t *msg);
|
static void send_nb(orte_rml_send_t *msg);
|
||||||
static void resend(struct mca_oob_tcp_msg_error_t *mop);
|
|
||||||
static void ft_event(int state);
|
static void ft_event(int state);
|
||||||
|
|
||||||
mca_oob_tcp_module_t mca_oob_tcp_module = {
|
mca_oob_tcp_module_t mca_oob_tcp_module = {
|
||||||
.accept_connection = accept_connection,
|
.accept_connection = accept_connection,
|
||||||
.ping = ping,
|
.ping = ping,
|
||||||
.send_nb = send_nb,
|
.send_nb = send_nb,
|
||||||
.resend = resend,
|
|
||||||
.ft_event = ft_event
|
.ft_event = ft_event
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
|
|
||||||
{
|
|
||||||
mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
|
|
||||||
mca_oob_tcp_peer_t *peer;
|
|
||||||
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s:tcp processing resend to peer %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&mp->hop));
|
|
||||||
|
|
||||||
/* do we know this peer? */
|
|
||||||
if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
|
|
||||||
/* push this back to the component so it can try
|
|
||||||
* another module within this transport. If no
|
|
||||||
* module can be found, the component can push back
|
|
||||||
* to the framework so another component can try
|
|
||||||
*/
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s:[%s:%d] peer %s unknown",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
__FILE__, __LINE__,
|
|
||||||
ORTE_NAME_PRINT(&mp->hop));
|
|
||||||
ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* should be impossible, but...has this peer had a progress thread assigned yet? */
|
|
||||||
if (NULL == peer->ev_base) {
|
|
||||||
/* nope - assign one */
|
|
||||||
ORTE_OOB_TCP_NEXT_BASE(peer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* add the msg to this peer's send queue */
|
|
||||||
if (MCA_OOB_TCP_CONNECTED == peer->state) {
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s tcp:resend: already connected to %s - queueing for send",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&peer->name));
|
|
||||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (MCA_OOB_TCP_CONNECTING != peer->state &&
|
|
||||||
MCA_OOB_TCP_CONNECT_ACK != peer->state) {
|
|
||||||
/* add the message to the queue for sending after the
|
|
||||||
* connection is formed
|
|
||||||
*/
|
|
||||||
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);
|
|
||||||
/* we have to initiate the connection - again, we do not
|
|
||||||
* want to block while the connection is created.
|
|
||||||
* So throw us into an event that will create
|
|
||||||
* the connection via a mini-state-machine :-)
|
|
||||||
*/
|
|
||||||
opal_output_verbose(2, orte_oob_base_framework.framework_output,
|
|
||||||
"%s tcp:send_nb: initiating connection to %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&peer->name));
|
|
||||||
peer->state = MCA_OOB_TCP_CONNECTING;
|
|
||||||
ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Event callback when there is data available on the registered
|
* Event callback when there is data available on the registered
|
||||||
* socket to recv. This is called for the listen sockets to accept an
|
* socket to recv. This is called for the listen sockets to accept an
|
||||||
|
@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
|
|||||||
const struct sockaddr *addr);
|
const struct sockaddr *addr);
|
||||||
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
|
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
|
||||||
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
|
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
|
||||||
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
|
|
||||||
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
|
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
|
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
|
||||||
mca_oob_tcp_module_ping_fn_t ping;
|
mca_oob_tcp_module_ping_fn_t ping;
|
||||||
mca_oob_tcp_module_send_nb_fn_t send_nb;
|
mca_oob_tcp_module_send_nb_fn_t send_nb;
|
||||||
mca_oob_tcp_module_resend_nb_fn_t resend;
|
|
||||||
mca_oob_tcp_module_ft_event_fn_t ft_event;
|
mca_oob_tcp_module_ft_event_fn_t ft_event;
|
||||||
} mca_oob_tcp_module_t;
|
} mca_oob_tcp_module_t;
|
||||||
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
|
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;
|
||||||
|
@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
|
|||||||
/* report the error back to the OOB and let it try other components
|
/* report the error back to the OOB and let it try other components
|
||||||
* or declare a problem
|
* or declare a problem
|
||||||
*/
|
*/
|
||||||
if (!orte_finalizing && !orte_abnormal_term_ordered) {
|
mop->rmsg->retries++;
|
||||||
/* if this was a lifeline, then alert */
|
/* activate the OOB send state */
|
||||||
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) {
|
ORTE_OOB_SEND(mop->rmsg);
|
||||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
|
|
||||||
} else {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
OBJ_RELEASE(mop);
|
OBJ_RELEASE(mop);
|
||||||
}
|
}
|
||||||
@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
|
|||||||
*/
|
*/
|
||||||
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
|
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
|
||||||
snd = OBJ_NEW(orte_rml_send_t);
|
snd = OBJ_NEW(orte_rml_send_t);
|
||||||
|
snd->retries = mop->rmsg->retries + 1;
|
||||||
snd->dst = mop->snd->hdr.dst;
|
snd->dst = mop->snd->hdr.dst;
|
||||||
snd->origin = mop->snd->hdr.origin;
|
snd->origin = mop->snd->hdr.origin;
|
||||||
snd->tag = mop->snd->hdr.tag;
|
snd->tag = mop->snd->hdr.tag;
|
||||||
@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&pop->peer));
|
ORTE_NAME_PRINT(&pop->peer));
|
||||||
|
|
||||||
/* if this was a lifeline, then alert */
|
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
|
||||||
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
|
|
||||||
} else {
|
|
||||||
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(pop);
|
OBJ_RELEASE(pop);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
|||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc) \
|
|
||||||
do { \
|
|
||||||
mca_oob_tcp_msg_error_t *mp; \
|
|
||||||
opal_output_verbose(5, orte_oob_base_framework.framework_output, \
|
|
||||||
"%s:[%s:%d] post resend to %s", \
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
|
||||||
__FILE__, __LINE__, \
|
|
||||||
ORTE_NAME_PRINT(&((mop)->hop))); \
|
|
||||||
mp = OBJ_NEW(mca_oob_tcp_msg_error_t); \
|
|
||||||
mp->snd = (mop)->snd; \
|
|
||||||
mp->hop = (mop)->hop; \
|
|
||||||
opal_event_set(op->snd->peer->ev_base, &mp->ev, -1, \
|
|
||||||
OPAL_EV_WRITE, (cbfunc), mp); \
|
|
||||||
opal_event_set_priority(&mp->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&mp->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
} while(0);
|
|
||||||
|
|
||||||
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
|
||||||
do { \
|
do { \
|
||||||
mca_oob_tcp_msg_error_t *mop; \
|
mca_oob_tcp_msg_error_t *mop; \
|
||||||
@ -320,8 +303,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
|
|||||||
mop->rmsg = (r); \
|
mop->rmsg = (r); \
|
||||||
mop->hop.jobid = (h)->jobid; \
|
mop->hop.jobid = (h)->jobid; \
|
||||||
mop->hop.vpid = (h)->vpid; \
|
mop->hop.vpid = (h)->vpid; \
|
||||||
/* this goes to the OOB framework, so use that event base */ \
|
/* this goes to the component, so use the framework \
|
||||||
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
* event base */ \
|
||||||
|
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
|
||||||
OPAL_EV_WRITE, (c), mop); \
|
OPAL_EV_WRITE, (c), mop); \
|
||||||
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
|
||||||
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
|
||||||
|
@ -410,15 +410,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_process_info.num_procs = jdatorted->num_procs;
|
|
||||||
|
|
||||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
|
||||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ensure all routing plans are up-to-date */
|
|
||||||
orte_routed.update_routing_plan(NULL);
|
|
||||||
|
|
||||||
/* If this job is being started by me, then there is nothing
|
/* If this job is being started by me, then there is nothing
|
||||||
* further we need to do as any user directives (e.g., to tie
|
* further we need to do as any user directives (e.g., to tie
|
||||||
* off IO to /dev/null) will have been included in the launch
|
* off IO to /dev/null) will have been included in the launch
|
||||||
@ -2158,7 +2149,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
|||||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ensure all routing plans are up-to-date */
|
/* ensure all routing plans are up-to-date - we need this
|
||||||
|
* so we know how to tree-spawn and/or xcast info */
|
||||||
orte_routed.update_routing_plan(NULL);
|
orte_routed.update_routing_plan(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -81,7 +81,10 @@ typedef uint32_t orte_proc_state_t;
|
|||||||
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
|
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
|
||||||
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
|
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
|
||||||
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
|
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
|
||||||
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* unable to send a message */
|
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* connection to lifeline lost */
|
||||||
|
#define ORTE_PROC_STATE_NO_PATH_TO_TARGET (ORTE_PROC_STATE_ERROR + 16) /* no path for communicating to target peer */
|
||||||
|
#define ORTE_PROC_STATE_FAILED_TO_CONNECT (ORTE_PROC_STATE_ERROR + 17) /* unable to connect to target peer */
|
||||||
|
#define ORTE_PROC_STATE_PEER_UNKNOWN (ORTE_PROC_STATE_ERROR + 18) /* unknown peer */
|
||||||
|
|
||||||
/* Define a boundary so that external developers
|
/* Define a boundary so that external developers
|
||||||
* have a starting point for defining their own
|
* have a starting point for defining their own
|
||||||
|
@ -801,6 +801,15 @@ static int remote_spawn(opal_buffer_t *launch)
|
|||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* extract and update the daemon map */
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* since we are tree-spawning, we need to update the routing plan */
|
||||||
|
orte_routed.update_routing_plan(NULL);
|
||||||
|
|
||||||
/* get the updated routing list */
|
/* get the updated routing list */
|
||||||
rtmod = orte_rml.get_routed(orte_coll_conduit);
|
rtmod = orte_rml.get_routed(orte_coll_conduit);
|
||||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -231,7 +231,13 @@ void orte_rml_send_callback(int status, orte_process_name_t *peer,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(peer), tag,
|
ORTE_NAME_PRINT(peer), tag,
|
||||||
ORTE_ERROR_NAME(status));
|
ORTE_ERROR_NAME(status));
|
||||||
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
if (ORTE_ERR_NO_PATH_TO_TARGET == status) {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_NO_PATH_TO_TARGET);
|
||||||
|
} else if (ORTE_ERR_ADDRESSEE_UNKNOWN == status) {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_PEER_UNKNOWN);
|
||||||
|
} else {
|
||||||
|
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,2 +1 @@
|
|||||||
anandhis
|
anandhis
|
||||||
rhc
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2007-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2007-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -41,6 +41,7 @@ OBJ_CLASS_DECLARATION(orte_routed_base_active_t);
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
opal_list_t actives;
|
opal_list_t actives;
|
||||||
|
bool routing_enabled;
|
||||||
} orte_routed_base_t;
|
} orte_routed_base_t;
|
||||||
ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base;
|
ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base;
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -110,7 +110,7 @@ orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t
|
|||||||
orte_routed_base_active_t *active;
|
orte_routed_base_active_t *active;
|
||||||
|
|
||||||
/* a NULL module corresponds to direct */
|
/* a NULL module corresponds to direct */
|
||||||
if (NULL == module) {
|
if (!orte_routed_base.routing_enabled || NULL == module) {
|
||||||
return *target;
|
return *target;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,6 +178,7 @@ void orte_routed_base_update_routing_plan(char *module)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -58,6 +58,8 @@ static int orte_routed_base_open(mca_base_open_flag_t flags)
|
|||||||
{
|
{
|
||||||
/* setup our list of actives */
|
/* setup our list of actives */
|
||||||
OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t);
|
OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t);
|
||||||
|
/* start with routing DISABLED */
|
||||||
|
orte_routed_base.routing_enabled = false;
|
||||||
|
|
||||||
/* Open up all available components */
|
/* Open up all available components */
|
||||||
return mca_base_framework_components_open(&orte_routed_base_framework, flags);
|
return mca_base_framework_components_open(&orte_routed_base_framework, flags);
|
||||||
|
@ -49,7 +49,7 @@ static int orte_routed_debruijn_component_query(mca_base_module_t **module, int
|
|||||||
* systems, we will allow other options that have even fewer hops to
|
* systems, we will allow other options that have even fewer hops to
|
||||||
* support wireup
|
* support wireup
|
||||||
*/
|
*/
|
||||||
*priority = 70;
|
*priority = 10;
|
||||||
*module = (mca_base_module_t *) &orte_routed_debruijn_module;
|
*module = (mca_base_module_t *) &orte_routed_debruijn_module;
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -538,4 +538,3 @@ static int radix_ft_event(int state)
|
|||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -69,7 +69,7 @@ static int orte_routed_radix_component_query(mca_base_module_t **module, int *pr
|
|||||||
return ORTE_ERR_BAD_PARAM;
|
return ORTE_ERR_BAD_PARAM;
|
||||||
}
|
}
|
||||||
|
|
||||||
*priority = 50;
|
*priority = 70;
|
||||||
*module = (mca_base_module_t *) &orte_routed_radix_module;
|
*module = (mca_base_module_t *) &orte_routed_radix_module;
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -317,7 +317,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Send the request to termiante
|
* Send the request to terminate
|
||||||
*/
|
*/
|
||||||
if( num_new_procs > 0 ) {
|
if( num_new_procs > 0 ) {
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||||
|
@ -344,6 +344,7 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init
|
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init
|
||||||
* we continue to have a reference count on them. So we have to finalize them twice...
|
* we continue to have a reference count on them. So we have to finalize them twice...
|
||||||
*/
|
*/
|
||||||
@ -647,7 +648,6 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
/* If I have a parent, then save his contact info so
|
/* If I have a parent, then save his contact info so
|
||||||
* any messages we send can flow thru him.
|
* any messages we send can flow thru him.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
orte_parent_uri = NULL;
|
orte_parent_uri = NULL;
|
||||||
(void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
|
(void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
|
||||||
"URI for the parent if tree launch is enabled.",
|
"URI for the parent if tree launch is enabled.",
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -434,6 +434,12 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
|||||||
return "UNABLE TO SEND MSG";
|
return "UNABLE TO SEND MSG";
|
||||||
case ORTE_PROC_STATE_LIFELINE_LOST:
|
case ORTE_PROC_STATE_LIFELINE_LOST:
|
||||||
return "LIFELINE LOST";
|
return "LIFELINE LOST";
|
||||||
|
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
|
||||||
|
return "NO PATH TO TARGET";
|
||||||
|
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
|
||||||
|
return "FAILED TO CONNECT";
|
||||||
|
case ORTE_PROC_STATE_PEER_UNKNOWN:
|
||||||
|
return "PEER UNKNOWN";
|
||||||
case ORTE_PROC_STATE_ANY:
|
case ORTE_PROC_STATE_ANY:
|
||||||
return "ANY";
|
return "ANY";
|
||||||
default:
|
default:
|
||||||
|
@ -62,6 +62,7 @@
|
|||||||
#include "orte/mca/dfs/dfs.h"
|
#include "orte/mca/dfs/dfs.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/odls/base/odls_private.h"
|
#include "orte/mca/odls/base/odls_private.h"
|
||||||
|
#include "orte/mca/routed/routed.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
@ -686,7 +687,11 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
|||||||
free(dids);
|
free(dids);
|
||||||
|
|
||||||
/* update num procs */
|
/* update num procs */
|
||||||
orte_process_info.num_procs = daemons->num_procs;
|
if (orte_process_info.num_procs != daemons->num_procs) {
|
||||||
|
orte_process_info.num_procs = daemons->num_procs;
|
||||||
|
/* need to update the routing plan */
|
||||||
|
orte_routed.update_routing_plan(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user