1
1

Merge pull request #2864 from rhc54/topic/rsh

Repair rsh/ssh tree spawn
Этот коммит содержится в:
Ralph Castain 2017-01-27 16:31:35 -08:00 коммит произвёл GitHub
родитель 3440b46e5e 7c795f4416
Коммит 410befd255
24 изменённых файлов: 156 добавлений и 140 удалений

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved. # Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of connection to the node, or possibly an internal failure of
the daemon itself. We cannot recover from this failure, and the daemon itself. We cannot recover from this failure, and
therefore will terminate the job. therefore will terminate the job.
#
[no-path]
ORTE does not know how to route a message to the specified daemon
located on the indicated node:
my node: %s
target node: %s
This is usually an internal programming error that should be
reported to the developers. In the meantime, a workaround may
be to set the MCA param routed=direct on the command line or
in your environment. We apologize for the problem.
#
[no-connect]
ORTE is unable to establish a communication connection to the
specified daemon located on the indicated node:
my node: %s
target node: %s
This is usually due to a lack of common network interfaces and/or
no route found between them. Please check network connectivity (including
firewalls and network routing requirements). If these look okay,
then it could be an internal programming error that should be
reported to the developers. In the meantime, a workaround may
be to set the MCA param routed=direct on the command line or
in your environment.

Просмотреть файл

@ -9,7 +9,7 @@
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved. * Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
} }
break; break;
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:hnp: no message path to proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
orte_show_help("help-errmgr-base.txt", "no-path", true,
orte_process_info.nodename, pptr->node->name);
/* if this proc is one of my daemons, then we are truly
* hosed - so just exit out
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
break;
}
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* abnormal termination - abort, but only do it once
* to avoid creating a lot of confusion */
default_hnp_abort(jdata);
}
break;
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:hnp: cannot connect to proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
orte_show_help("help-errmgr-base.txt", "no-connect", true,
orte_process_info.nodename, pptr->node->name);
/* if this proc is one of my daemons, then we are truly
* hosed - so just exit out
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
break;
}
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* abnormal termination - abort, but only do it once
* to avoid creating a lot of confusion */
default_hnp_abort(jdata);
}
break;
default: default:
/* shouldn't get this, but terminate job if required */ /* shouldn't get this, but terminate job if required */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,

Просмотреть файл

@ -497,12 +497,6 @@ int orte_ess_base_orted_setup(char **hosts)
goto error; goto error;
} }
/* be sure to update the routing tree so the initial "phone home"
* to mpirun goes through the tree if static ports were enabled - still
* need to do it anyway just to initialize things
*/
orte_routed.update_routing_plan(NULL);
/* if we are using static ports, then we need to setup /* if we are using static ports, then we need to setup
* the daemon info so the RML can function properly * the daemon info so the RML can function properly
* without requiring a wireup stage. This must be done * without requiring a wireup stage. This must be done
@ -519,6 +513,12 @@ int orte_ess_base_orted_setup(char **hosts)
error = "construct daemon map from static ports"; error = "construct daemon map from static ports";
goto error; goto error;
} }
/* be sure to update the routing tree so the initial "phone home"
* to mpirun goes through the tree if static ports were enabled
*/
orte_routed.update_routing_plan(NULL);
/* routing can be enabled */
orte_routed_base.routing_enabled = true;
} }
/* Now provide a chance for the PLM /* Now provide a chance for the PLM

Просмотреть файл

@ -27,7 +27,7 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/base/base.h" #include "orte/mca/rml/base/base.h"
#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/routed.h" #include "orte/mca/routed/base/base.h"
#include "orte/mca/state/state.h" #include "orte/mca/state/state.h"
#include "orte/util/compress.h" #include "orte/util/compress.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
@ -386,8 +386,14 @@ static void xcast_recv(int status, orte_process_name_t* sender,
goto relay; goto relay;
} }
/* update the routing plan */ if (!ORTE_PROC_IS_HNP) {
orte_routed.update_routing_plan(rtmod); /* update the routing plan - the HNP already did
* it when it computed the VM, so don't waste time
* re-doing it here */
orte_routed.update_routing_plan(rtmod);
}
/* routing is now possible */
orte_routed_base.routing_enabled = true;
/* see if we have wiring info as well */ /* see if we have wiring info as well */
cnt=1; cnt=1;

Просмотреть файл

@ -2,7 +2,7 @@
/* /*
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
OBJ_RELEASE(cd); OBJ_RELEASE(cd);
opal_output_verbose(5, orte_oob_base_framework.framework_output, opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:base:send to target %s", "%s oob:base:send to target %s - %u attempt",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&msg->dst)); ORTE_NAME_PRINT(&msg->dst), msg->retries);
/* don't try forever - if we have exceeded the number of retries,
* then report this message as undeliverable even if someone continues
* to think they could reach it */
if (orte_rml_base.max_retries <= msg->retries) {
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
ORTE_RML_SEND_COMPLETE(msg);
return;
}
/* check if we have this peer in our hash table */ /* check if we have this peer in our hash table */
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t)); memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));

Просмотреть файл

@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
const struct sockaddr *addr); const struct sockaddr *addr);
static void ping(const orte_process_name_t *proc); static void ping(const orte_process_name_t *proc);
static void send_nb(orte_rml_send_t *msg); static void send_nb(orte_rml_send_t *msg);
static void resend(struct mca_oob_tcp_msg_error_t *mop);
static void ft_event(int state); static void ft_event(int state);
mca_oob_tcp_module_t mca_oob_tcp_module = { mca_oob_tcp_module_t mca_oob_tcp_module = {
.accept_connection = accept_connection, .accept_connection = accept_connection,
.ping = ping, .ping = ping,
.send_nb = send_nb, .send_nb = send_nb,
.resend = resend,
.ft_event = ft_event .ft_event = ft_event
}; };
@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
} }
} }
/*
 * Process a request to resend a message to a given hop.
 *
 * mpi - error descriptor holding the message to resend (snd) and the
 *       name of the hop it is to be sent to (hop).
 *
 * If the hop is not a known peer, the message is handed to
 * mca_oob_tcp_component_hop_unknown. If the peer is already connected,
 * the message is queued and the send activated. Otherwise the message
 * is queued as pending, and a connection attempt is started unless one
 * is already in progress.
 */
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
{
    mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
    mca_oob_tcp_peer_t *peer;

    opal_output_verbose(2, orte_oob_base_framework.framework_output,
                        "%s:tcp processing resend to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&mp->hop));

    /* do we know this peer? */
    if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
        /* push this back to the component so it can try
         * another module within this transport. If no
         * module can be found, the component can push back
         * to the framework so another component can try
         */
        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                            "%s:[%s:%d] peer %s unknown",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            __FILE__, __LINE__,
                            ORTE_NAME_PRINT(&mp->hop));
        ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
        return;
    }

    /* should be impossible, but...has this peer had a progress thread assigned yet? */
    if (NULL == peer->ev_base) {
        /* nope - assign one */
        ORTE_OOB_TCP_NEXT_BASE(peer);
    }

    /* already connected - queue the msg and activate the send */
    if (MCA_OOB_TCP_CONNECTED == peer->state) {
        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                            "%s tcp:resend: already connected to %s - queueing for send",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer->name));
        MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
        return;
    }

    /* add the message to the queue for sending after the
     * connection is formed. This must happen even when a
     * connection attempt is already underway - otherwise the
     * message would be silently dropped
     */
    MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);

    if (MCA_OOB_TCP_CONNECTING != peer->state &&
        MCA_OOB_TCP_CONNECT_ACK != peer->state) {
        /* we have to initiate the connection - again, we do not
         * want to block while the connection is created.
         * So throw us into an event that will create
         * the connection via a mini-state-machine :-)
         */
        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                            "%s tcp:resend: initiating connection to %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer->name));
        peer->state = MCA_OOB_TCP_CONNECTING;
        ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
    }
}
/* /*
* Event callback when there is data available on the registered * Event callback when there is data available on the registered
* socket to recv. This is called for the listen sockets to accept an * socket to recv. This is called for the listen sockets to accept an

Просмотреть файл

@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
const struct sockaddr *addr); const struct sockaddr *addr);
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc); typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg); typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state); typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
typedef struct { typedef struct {
mca_oob_tcp_module_accept_connection_fn_t accept_connection; mca_oob_tcp_module_accept_connection_fn_t accept_connection;
mca_oob_tcp_module_ping_fn_t ping; mca_oob_tcp_module_ping_fn_t ping;
mca_oob_tcp_module_send_nb_fn_t send_nb; mca_oob_tcp_module_send_nb_fn_t send_nb;
mca_oob_tcp_module_resend_nb_fn_t resend;
mca_oob_tcp_module_ft_event_fn_t ft_event; mca_oob_tcp_module_ft_event_fn_t ft_event;
} mca_oob_tcp_module_t; } mca_oob_tcp_module_t;
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module; ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;

Просмотреть файл

@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
/* report the error back to the OOB and let it try other components /* report the error back to the OOB and let it try other components
* or declare a problem * or declare a problem
*/ */
if (!orte_finalizing && !orte_abnormal_term_ordered) { mop->rmsg->retries++;
/* if this was a lifeline, then alert */ /* activate the OOB send state */
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) { ORTE_OOB_SEND(mop->rmsg);
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
}
}
OBJ_RELEASE(mop); OBJ_RELEASE(mop);
} }
@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
*/ */
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr); MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
snd = OBJ_NEW(orte_rml_send_t); snd = OBJ_NEW(orte_rml_send_t);
snd->retries = mop->rmsg->retries + 1;
snd->dst = mop->snd->hdr.dst; snd->dst = mop->snd->hdr.dst;
snd->origin = mop->snd->hdr.origin; snd->origin = mop->snd->hdr.origin;
snd->tag = mop->snd->hdr.tag; snd->tag = mop->snd->hdr.tag;
@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pop->peer)); ORTE_NAME_PRINT(&pop->peer));
/* if this was a lifeline, then alert */ ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
}
OBJ_RELEASE(pop); OBJ_RELEASE(pop);
} }

Просмотреть файл

@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
} while(0); } while(0);
/*
 * Post a resend request for the message carried by an error descriptor.
 *
 * mop    - pointer to the mca_oob_tcp_msg_error_t whose snd and hop
 *          fields are copied into a newly allocated descriptor
 * cbfunc - event callback that receives the new descriptor
 *
 * The event is set on the event base of the peer attached to the
 * message, given ORTE_MSG_PRI priority, and activated immediately.
 * The trailing semicolon after while(0) is retained to match the
 * neighboring activation macros in this file.
 */
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc)                          \
    do {                                                                    \
        mca_oob_tcp_msg_error_t *mp;                                        \
        opal_output_verbose(5, orte_oob_base_framework.framework_output,    \
                            "%s:[%s:%d] post resend to %s",                 \
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),             \
                            __FILE__, __LINE__,                             \
                            ORTE_NAME_PRINT(&((mop)->hop)));                \
        mp = OBJ_NEW(mca_oob_tcp_msg_error_t);                              \
        mp->snd = (mop)->snd;                                               \
        mp->hop = (mop)->hop;                                               \
        /* use the macro argument - not a caller-scope variable */          \
        opal_event_set((mop)->snd->peer->ev_base, &mp->ev, -1,              \
                       OPAL_EV_WRITE, (cbfunc), mp);                        \
        opal_event_set_priority(&mp->ev, ORTE_MSG_PRI);                     \
        opal_event_active(&mp->ev, OPAL_EV_WRITE, 1);                       \
    } while(0);
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \ #define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
do { \ do { \
mca_oob_tcp_msg_error_t *mop; \ mca_oob_tcp_msg_error_t *mop; \
@ -320,8 +303,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
mop->rmsg = (r); \ mop->rmsg = (r); \
mop->hop.jobid = (h)->jobid; \ mop->hop.jobid = (h)->jobid; \
mop->hop.vpid = (h)->vpid; \ mop->hop.vpid = (h)->vpid; \
/* this goes to the OOB framework, so use that event base */ \ /* this goes to the component, so use the framework \
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \ * event base */ \
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
OPAL_EV_WRITE, (c), mop); \ OPAL_EV_WRITE, (c), mop); \
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \ opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \ opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \

Просмотреть файл

@ -410,15 +410,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
return; return;
} }
orte_process_info.num_procs = jdatorted->num_procs;
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* ensure all routing plans are up-to-date */
orte_routed.update_routing_plan(NULL);
/* If this job is being started by me, then there is nothing /* If this job is being started by me, then there is nothing
* further we need to do as any user directives (e.g., to tie * further we need to do as any user directives (e.g., to tie
* off IO to /dev/null) will have been included in the launch * off IO to /dev/null) will have been included in the launch
@ -2158,7 +2149,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
orte_process_info.max_procs = orte_process_info.num_procs; orte_process_info.max_procs = orte_process_info.num_procs;
} }
/* ensure all routing plans are up-to-date */ /* ensure all routing plans are up-to-date - we need this
* so we know how to tree-spawn and/or xcast info */
orte_routed.update_routing_plan(NULL); orte_routed.update_routing_plan(NULL);
} }

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -81,7 +81,10 @@ typedef uint32_t orte_proc_state_t;
#define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */ #define ORTE_PROC_STATE_TERM_NON_ZERO (ORTE_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
#define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */ #define ORTE_PROC_STATE_FAILED_TO_LAUNCH (ORTE_PROC_STATE_ERROR + 13) /* unable to launch process */
#define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */ #define ORTE_PROC_STATE_UNABLE_TO_SEND_MSG (ORTE_PROC_STATE_ERROR + 14) /* unable to send a message */
#define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* unable to send a message */ #define ORTE_PROC_STATE_LIFELINE_LOST (ORTE_PROC_STATE_ERROR + 15) /* connection to lifeline lost */
#define ORTE_PROC_STATE_NO_PATH_TO_TARGET (ORTE_PROC_STATE_ERROR + 16) /* no path for communicating to target peer */
#define ORTE_PROC_STATE_FAILED_TO_CONNECT (ORTE_PROC_STATE_ERROR + 17) /* unable to connect to target peer */
#define ORTE_PROC_STATE_PEER_UNKNOWN (ORTE_PROC_STATE_ERROR + 18) /* unknown peer */
/* Define a boundary so that external developers /* Define a boundary so that external developers
* have a starting point for defining their own * have a starting point for defining their own

Просмотреть файл

@ -801,6 +801,15 @@ static int remote_spawn(opal_buffer_t *launch)
goto cleanup; goto cleanup;
} }
/* extract and update the daemon map */
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* since we are tree-spawning, we need to update the routing plan */
orte_routed.update_routing_plan(NULL);
/* get the updated routing list */ /* get the updated routing list */
rtmod = orte_rml.get_routed(orte_coll_conduit); rtmod = orte_rml.get_routed(orte_coll_conduit);
OBJ_CONSTRUCT(&coll, opal_list_t); OBJ_CONSTRUCT(&coll, opal_list_t);

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science * Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -231,7 +231,13 @@ void orte_rml_send_callback(int status, orte_process_name_t *peer,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), tag, ORTE_NAME_PRINT(peer), tag,
ORTE_ERROR_NAME(status)); ORTE_ERROR_NAME(status));
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG); if (ORTE_ERR_NO_PATH_TO_TARGET == status) {
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_NO_PATH_TO_TARGET);
} else if (ORTE_ERR_ADDRESSEE_UNKNOWN == status) {
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_PEER_UNKNOWN);
} else {
ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
}
} }
} }

Просмотреть файл

@ -1,2 +1 @@
anandhis anandhis
rhc

Просмотреть файл

@ -1,7 +1,7 @@
/* /*
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. * Copyright (c) 2007-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -41,6 +41,7 @@ OBJ_CLASS_DECLARATION(orte_routed_base_active_t);
typedef struct { typedef struct {
opal_list_t actives; opal_list_t actives;
bool routing_enabled;
} orte_routed_base_t; } orte_routed_base_t;
ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base; ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base;

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -110,7 +110,7 @@ orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t
orte_routed_base_active_t *active; orte_routed_base_active_t *active;
/* a NULL module corresponds to direct */ /* a NULL module corresponds to direct */
if (NULL == module) { if (!orte_routed_base.routing_enabled || NULL == module) {
return *target; return *target;
} }
@ -178,6 +178,7 @@ void orte_routed_base_update_routing_plan(char *module)
} }
} }
} }
return; return;
} }

Просмотреть файл

@ -10,7 +10,7 @@
* reserved. * reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -58,6 +58,8 @@ static int orte_routed_base_open(mca_base_open_flag_t flags)
{ {
/* setup our list of actives */ /* setup our list of actives */
OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t); OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t);
/* start with routing DISABLED */
orte_routed_base.routing_enabled = false;
/* Open up all available components */ /* Open up all available components */
return mca_base_framework_components_open(&orte_routed_base_framework, flags); return mca_base_framework_components_open(&orte_routed_base_framework, flags);

Просмотреть файл

@ -49,7 +49,7 @@ static int orte_routed_debruijn_component_query(mca_base_module_t **module, int
* systems, we will allow other options that have even fewer hops to * systems, we will allow other options that have even fewer hops to
* support wireup * support wireup
*/ */
*priority = 70; *priority = 10;
*module = (mca_base_module_t *) &orte_routed_debruijn_module; *module = (mca_base_module_t *) &orte_routed_debruijn_module;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -6,7 +6,7 @@
* reserved. * reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -538,4 +538,3 @@ static int radix_ft_event(int state)
return exit_status; return exit_status;
} }
#endif #endif

Просмотреть файл

@ -69,7 +69,7 @@ static int orte_routed_radix_component_query(mca_base_module_t **module, int *pr
return ORTE_ERR_BAD_PARAM; return ORTE_ERR_BAD_PARAM;
} }
*priority = 50; *priority = 70;
*module = (mca_base_module_t *) &orte_routed_radix_module; *module = (mca_base_module_t *) &orte_routed_radix_module;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -317,7 +317,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
} }
/* /*
* Send the request to termiante * Send the request to terminate
*/ */
if( num_new_procs > 0 ) { if( num_new_procs > 0 ) {
OPAL_OUTPUT_VERBOSE((2, orte_debug_output, OPAL_OUTPUT_VERBOSE((2, orte_debug_output,

Просмотреть файл

@ -344,6 +344,7 @@ int orte_daemon(int argc, char *argv[])
return ret; return ret;
} }
} }
/* finalize the OPAL utils. As they are opened again from orte_init->opal_init /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
* we continue to have a reference count on them. So we have to finalize them twice... * we continue to have a reference count on them. So we have to finalize them twice...
*/ */
@ -647,7 +648,6 @@ int orte_daemon(int argc, char *argv[])
/* If I have a parent, then save his contact info so /* If I have a parent, then save his contact info so
* any messages we send can flow thru him. * any messages we send can flow thru him.
*/ */
orte_parent_uri = NULL; orte_parent_uri = NULL;
(void) mca_base_var_register ("orte", "orte", NULL, "parent_uri", (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
"URI for the parent if tree launch is enabled.", "URI for the parent if tree launch is enabled.",

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -434,6 +434,12 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
return "UNABLE TO SEND MSG"; return "UNABLE TO SEND MSG";
case ORTE_PROC_STATE_LIFELINE_LOST: case ORTE_PROC_STATE_LIFELINE_LOST:
return "LIFELINE LOST"; return "LIFELINE LOST";
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
return "NO PATH TO TARGET";
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
return "FAILED TO CONNECT";
case ORTE_PROC_STATE_PEER_UNKNOWN:
return "PEER UNKNOWN";
case ORTE_PROC_STATE_ANY: case ORTE_PROC_STATE_ANY:
return "ANY"; return "ANY";
default: default:

Просмотреть файл

@ -62,6 +62,7 @@
#include "orte/mca/dfs/dfs.h" #include "orte/mca/dfs/dfs.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/base/odls_private.h" #include "orte/mca/odls/base/odls_private.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
@ -686,7 +687,11 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
free(dids); free(dids);
/* update num procs */ /* update num procs */
orte_process_info.num_procs = daemons->num_procs; if (orte_process_info.num_procs != daemons->num_procs) {
orte_process_info.num_procs = daemons->num_procs;
/* need to update the routing plan */
orte_routed.update_routing_plan(NULL);
}
if (orte_process_info.max_procs < orte_process_info.num_procs) { if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs; orte_process_info.max_procs = orte_process_info.num_procs;