1
1

By popular demand the epoch code is now disabled by default.

To enable the epochs and the resilient orte code, use the configure flag:

--enable-resilient-orte

This will define both:

ORTE_ENABLE_EPOCH
ORTE_RESIL_ORTE

This commit was SVN r25093.
This commit is contained in:
Wesley Bland 2011-08-26 22:16:14 +00:00
parent 55a7b474dd
commit 4e7ff0bd5e
101 changed files with 652 additions and 362 deletions

View File

@ -693,8 +693,16 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
bool found = false;
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
"jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid, process_name->epoch, subnet_id, lid));
"jobid %d, vpid %d, "
#if ORTE_ENABLE_EPOCH
"epoch %d, "
#endif
"sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid,
#if ORTE_ENABLE_EPOCH
process_name->epoch,
#endif
subnet_id, lid));
/* find ibproc */
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
for (ib_proc = (mca_btl_openib_proc_t*)

View File

@ -1208,7 +1208,8 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
peer = OBJ_NEW(orte_namelist_t);
peer->name.jobid = comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid;
peer->name.vpid = comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid;
peer->name.epoch = comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch;
ORTE_EPOCH_SET(peer->name.epoch,comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch);
opal_list_append(&peers, &peer->item);
}
/* prepare send data */

View File

@ -702,7 +702,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN);
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
@ -730,7 +730,7 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN);
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
HOKE_TRAFFIC_MSG_REF_RETURN(item);
@ -840,7 +840,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -868,7 +868,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -902,7 +902,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -934,7 +934,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -954,7 +954,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN);
}
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
@ -962,7 +962,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN);
}
@ -1015,7 +1015,7 @@ do { \
}
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
{ \
HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1034,7 +1034,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \
\
msg_ref->matched = 0; \
msg_ref->done = 0; \
@ -1043,7 +1043,7 @@ do { \
msg_ref->active_drain = 0; \
}
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
{ \
HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1063,7 +1063,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \
}
@ -1466,7 +1466,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch;
ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch);
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
}
@ -3237,13 +3237,11 @@ static int traffic_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
peer_ref->proc_name.vpid);
} else {
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
ORTE_JOBID_INVALID, ORTE_VPID_INVALID,
ORTE_EPOCH_INVALID);
ORTE_JOBID_INVALID, ORTE_VPID_INVALID);
}
if( msg_type == COORD_MSG_TYPE_P_SEND ||
@ -3377,7 +3375,7 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
(*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch;
ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch);
}
return exit_status;
@ -3808,8 +3806,7 @@ static int drain_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type,
count, NULL, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
peer_ref->proc_name.vpid);
(*msg_ref)->done = 0;
(*msg_ref)->active = 0;
@ -5284,8 +5281,7 @@ static int send_bookmarks(int peer_idx)
*/
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = ORTE_EPOCH_INVALID;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name));
if( NULL == (peer_ref = find_peer(peer_name))) {
opal_output(mca_crcp_bkmrk_component.super.output_handle,
@ -5346,8 +5342,7 @@ static int recv_bookmarks(int peer_idx)
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = ORTE_EPOCH_INVALID;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name));
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
OMPI_CRCP_COORD_BOOKMARK_TAG,
@ -5529,7 +5524,8 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
d_msg_ack->peer.epoch = peer_ref->proc_name.epoch;
ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch);
d_msg_ack->complete = false;
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle,
@ -6169,8 +6165,7 @@ static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t *peer_r
count, datatype_size, tag, rank,
ompi_comm_lookup(comm_id),
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
peer_ref->proc_name.vpid);
traffic_message_create_drain_message(true, num_left_unresolved,
peer_ref,

View File

@ -1130,7 +1130,7 @@ static void process_cb(int fd, short event, void *data)
/* flag the identity of the remote proc */
carport.jobid = mev->sender.jobid;
carport.vpid = mev->sender.vpid;
carport.epoch = mev->sender.epoch;
ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch);
/* release the event */
OBJ_RELEASE(mev);

View File

@ -1,8 +1,5 @@
/*
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -398,13 +395,13 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
orte_proc.epoch = hdr->hdr_restart.hdr_epoch;
ompi_proc = ompi_proc_find(&orte_proc);
opal_output_verbose(20, mca_pml_bfo_output,
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
"hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, ompi_proc->proc_hostname=%s",
"hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s",
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
@ -413,7 +410,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
recvreq->remote_req_send.pval, (void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
hdr->hdr_restart.hdr_epoch, ompi_proc->proc_hostname);
ompi_proc->proc_hostname);
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
return;
}
@ -715,7 +712,6 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch;
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);

View File

@ -2,9 +2,6 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
@ -415,7 +412,6 @@ struct mca_pml_bfo_restart_hdr_t {
int32_t hdr_dst_rank; /**< needed to send NACK */
uint32_t hdr_jobid; /**< needed to send NACK */
uint32_t hdr_vpid; /**< needed to send NACK */
uint32_t hdr_epoch; /**< needed to send NACK */
};
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
@ -428,7 +424,6 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
(h).hdr_jobid = ntohl((h).hdr_jobid); \
(h).hdr_vpid = ntohl((h).hdr_vpid); \
(h).hdr_epoch = ntohl((h).hdr_epoch); \
} while (0)
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
@ -437,7 +432,6 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
(h).hdr_jobid = htonl((h).hdr_jobid); \
(h).hdr_vpid = htonl((h).hdr_vpid); \
(h).hdr_epoch = htonl((h).hdr_epoch); \
} while (0)
#endif /* PML_BFO */

View File

@ -108,7 +108,8 @@ int ompi_proc_init(void)
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.vpid = i;
proc->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN);
if (i == ORTE_PROC_MY_NAME->vpid) {
ompi_proc_local_proc = proc;
proc->proc_flags = OPAL_PROC_ALL_LOCAL;
@ -362,8 +363,7 @@ int ompi_proc_refresh(void) {
/* Does not change: proc->proc_name.vpid */
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.epoch = ORTE_EPOCH_INVALID;
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name));
/* Make sure to clear the local flag before we set it below */
proc->proc_flags = 0;

View File

@ -415,6 +415,14 @@ AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT_CR], [$opal_want_ft_cr],
AM_CONDITIONAL(WANT_FT, test "$opal_want_ft" = "1")
AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1")
#
# Compile in resilient runtime code
#
AC_ARG_ENABLE(resilient-orte,
[AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient runtime code.])])
AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"])
AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"])
#
# Do we want to install binaries?
#

View File

@ -81,24 +81,43 @@ typedef uint32_t orte_vpid_t;
#define ORTE_VPID_T OPAL_UINT32
#define ORTE_VPID_MAX UINT32_MAX-2
#define ORTE_VPID_MIN 0
#if ORTE_ENABLE_EPOCH
typedef uint32_t orte_epoch_t;
#define ORTE_EPOCH_T OPAL_UINT32
#define ORTE_EPOCH_MAX UINT32_MAX-2
#define ORTE_EPOCH_MIN 0
#endif
#if ORTE_ENABLE_EPOCH
#define ORTE_PROCESS_NAME_HTON(n) \
do { \
n.jobid = htonl(n.jobid); \
n.vpid = htonl(n.vpid); \
n.epoch = htonl(n.epoch); \
} while (0)
#else
#define ORTE_PROCESS_NAME_HTON(n) \
do { \
n.jobid = htonl(n.jobid); \
n.vpid = htonl(n.vpid); \
} while (0)
#endif
#if ORTE_ENABLE_EPOCH
#define ORTE_PROCESS_NAME_NTOH(n) \
do { \
n.jobid = ntohl(n.jobid); \
n.vpid = ntohl(n.vpid); \
n.epoch = ntohl(n.epoch); \
} while (0)
#else
#define ORTE_PROCESS_NAME_NTOH(n) \
do { \
n.jobid = ntohl(n.jobid); \
n.vpid = ntohl(n.vpid); \
} while (0)
#endif
#define ORTE_NAME_ARGS(n) \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
@ -127,6 +146,7 @@ do { \
struct orte_process_name_t {
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process id - equivalent to rank */
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
* The epoch will start at ORTE_EPOCH_MIN and
* increment every time the process is detected as
@ -135,6 +155,7 @@ struct orte_process_name_t {
* processes that did not directly detect the
* failure to increment their epochs.
*/
#endif
};
typedef struct orte_process_name_t orte_process_name_t;
@ -157,7 +178,10 @@ typedef void* orte_iov_base_ptr_t;
#define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an orte_process_name_t */
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
#if ORTE_ENABLE_EPOCH
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
/* State-related types */

View File

@ -386,7 +386,7 @@ static void recv_cmd(int status,
dat = OBJ_NEW(orte_db_data_t);
dat->name.jobid = sender->jobid;
dat->name.vpid = sender->vpid;
dat->name.epoch= sender->epoch;
ORTE_EPOCH_SET(dat->name.epoch,sender->epoch);
dat->key = key;
count=1;
opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32);

View File

@ -82,8 +82,10 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_set_fault_callback
orte_errmgr_base_register_migration_warning
#if ORTE_RESIL_ORTE
,orte_errmgr_base_set_fault_callback
#endif
};
/************************
@ -93,18 +95,23 @@ static int init(void)
{
int ret = ORTE_SUCCESS;
#if ORTE_RESIL_ORTE
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_EPOCH_CHANGE,
ORTE_RML_PERSISTENT,
epoch_change_recv,
NULL);
#endif
return ret;
}
static int finalize(void)
{
#if ORTE_RESIL_ORTE
orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_EPOCH_CHANGE);
#endif
return ORTE_SUCCESS;
}
@ -151,6 +158,7 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
@ -209,15 +217,20 @@ void epoch_change(int fd,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
(*fault_cbfunc)(procs);
} else if (NULL == fault_cbfunc) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed (NULL pointer)!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed!",
"%s errmgr:app Calling fault callback failed (num_dead <= 0)!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
free(proc);
OBJ_RELEASE(procs);
}
#endif
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{

View File

@ -97,13 +97,13 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN);
}
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID);
item->proc_name.jobid = ORTE_JOBID_INVALID;
}
@ -139,13 +139,13 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN);
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->node_name = NULL;
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN);
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
item->map_node_name = NULL;
@ -156,7 +156,7 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID);
item->proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->node_name ) {
@ -165,7 +165,7 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
}
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID);
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->map_node_name ) {

View File

@ -267,7 +267,7 @@ static int errmgr_base_tool_start_cmdline_listener(void)
*/
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
errmgr_cmdline_sender.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN);
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_MIGRATE,
0,
@ -379,14 +379,14 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
swap_dest.jobid = errmgr_cmdline_sender.jobid;
swap_dest.vpid = errmgr_cmdline_sender.vpid;
swap_dest.epoch = errmgr_cmdline_sender.epoch;
ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch);
errmgr_cmdline_sender = *sender;
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
errmgr_cmdline_sender.jobid = swap_dest.jobid;
errmgr_cmdline_sender.vpid = swap_dest.vpid;
errmgr_cmdline_sender.epoch = swap_dest.epoch;
ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch);
goto cleanup;
}

View File

@ -53,6 +53,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -83,9 +84,11 @@ static orte_errmgr_base_module_t global_module = {
orte_errmgr_hnp_global_suggest_map_targets,
/* FT Event hook */
orte_errmgr_hnp_global_ft_event,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_migration_warning
#if ORTE_RESIL_ORTE
/* Set the callback */
orte_errmgr_base_set_fault_callback
,orte_errmgr_base_set_fault_callback
#endif
};
@ -97,14 +100,16 @@ static void failed_start(orte_job_t *jdata);
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate,
orte_proc_state_t state, orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
#if ORTE_RESIL_ORTE
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
#endif
/************************
* API Definitions
@ -380,16 +385,21 @@ cleanup:
**********************/
int orte_errmgr_hnp_base_global_init(void)
{
int ret;
int ret = ORTE_SUCCESS;
#if ORTE_RESIL_ORTE
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE,
ORTE_RML_PERSISTENT, failure_notification, NULL);
#endif
return ret;
}
int orte_errmgr_hnp_base_global_finalize(void)
{
#if ORTE_RESIL_ORTE
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE);
#endif
return ORTE_SUCCESS;
}
@ -406,6 +416,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
orte_odls_child_t *child;
int rc;
orte_app_context_t *app;
orte_proc_t *pdat;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported state %s"
@ -538,7 +549,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
exit_code);
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -550,7 +561,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
break;
case ORTE_JOB_STATE_COMM_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -562,7 +573,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
break;
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -632,10 +643,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
}
}
if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) {
exit_code = 0;
}
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
@ -679,7 +686,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
if (jdata->enable_recovery) {
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its restart limit */
@ -778,18 +785,37 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
}
} else {
#if !ORTE_RESIL_ORTE
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid), "Unknown");
} else {
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid),
(NULL == pdat->node) ? "Unknown" :
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
}
#endif
if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) {
/* The process is already dead so don't keep trying to do
* this stuff. */
return ORTE_SUCCESS;
}
#if !ORTE_RESIL_ORTE
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
#endif
/* We'll check if the job was complete when we get the
* message back from the HNP notifying us of the dead
* process */
@ -805,7 +831,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
} else {
orte_errmgr_hnp_record_dead_process(proc);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
return ORTE_ERR_UNRECOVERABLE;
@ -824,6 +850,7 @@ int orte_errmgr_hnp_base_global_ft_event(int state)
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
@ -984,6 +1011,7 @@ static void failure_notification(int status, orte_process_name_t* sender,
OBJ_RELEASE(dead_names);
}
#endif
/*****************
* Local Functions
@ -1354,7 +1382,6 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
#if 0
case ORTE_PROC_STATE_ABORTED_BY_SIG:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s aborted by signal",
@ -1370,7 +1397,6 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
#endif
case ORTE_PROC_STATE_TERM_WO_SYNC:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s terminated without sync",
@ -1393,7 +1419,6 @@ static void check_job_complete(orte_job_t *jdata)
}
break;
case ORTE_PROC_STATE_COMM_FAILED:
#if 1
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
@ -1403,7 +1428,6 @@ static void check_job_complete(orte_job_t *jdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
#endif
break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
if (!jdata->abort) {
@ -1530,9 +1554,6 @@ static void check_job_complete(orte_job_t *jdata)
*/
CHECK_DAEMONS:
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
#if 0
if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */
#endif
if (0 == orte_routed.num_routes()) {
/* orteds are done! */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -1696,7 +1717,7 @@ CHECK_ALIVE:
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
@ -1707,7 +1728,9 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_JOBID_WILDCARD == job
&& ORTE_VPID_WILDCARD == vpid
&& ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
@ -1718,7 +1741,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
ORTE_EPOCH_SET(proc.name.epoch,epoch);
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -1913,13 +1936,15 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
}
if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) &&
ORTE_PROC_STATE_TERMINATED < pdat->state) {
ORTE_PROC_STATE_TERMINATED > pdat->state) {
#if ORTE_ENABLE_EPOCH
/* Make sure that the epochs match. */
if (proc->epoch != pdat->name.epoch) {
opal_output(1, "The epoch does not match the current epoch. Throwing the request out.");
return ORTE_SUCCESS;
}
#endif
dead_names = OBJ_NEW(opal_pointer_array_t);
@ -1935,6 +1960,7 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
}
}
#if ORTE_RESIL_ORTE
if (!mca_errmgr_hnp_component.term_in_progress) {
/*
* Send a message to the other daemons so they know that a daemon has
@ -1949,7 +1975,7 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
OBJ_RELEASE(buffer);
} else {
/* Iterate of the list of dead procs and send them along with
/* Iterate over the list of dead procs and send them along with
* the rest. The HNP needs this info so it can tell the other
* ORTEDs and they can inform the appropriate applications.
*/
@ -1973,6 +1999,9 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
} else {
orte_errmgr_hnp_global_mark_processes_as_dead(dead_names);
}
#else
orte_errmgr_hnp_global_mark_processes_as_dead(dead_names);
#endif
}
return ORTE_SUCCESS;
@ -2011,6 +2040,7 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pdat->name)));
#if ORTE_RESIL_ORTE
/* Make sure the epochs match, if not it probably means that we
* already reported this failure. */
if (name_item->epoch != pdat->name.epoch) {
@ -2018,6 +2048,7 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
}
orte_util_set_epoch(name_item, name_item->epoch + 1);
#endif
/* Remove it from the job array */
opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL);
@ -2034,6 +2065,7 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
OBJ_RELEASE(pdat);
#if ORTE_RESIL_ORTE
/* Create a new proc object that will keep track of the epoch
* information */
pdat = OBJ_NEW(orte_proc_t);
@ -2041,14 +2073,15 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
pdat->name.vpid = name_item->vpid;
pdat->name.epoch = name_item->epoch + 1;
/* Set the state as terminated so we'll know the process isn't
* actually there. */
pdat->state = ORTE_PROC_STATE_TERMINATED;
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
jdat->num_procs++;
jdat->num_terminated++;
#endif
/* Set the state as terminated so we'll know the process isn't
* actually there. */
pdat->state = ORTE_PROC_STATE_TERMINATED;
} else {
#if ORTE_RESIL_ORTE
opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item));
/* Create a new proc object that will keep track of the epoch
* information */
@ -2064,11 +2097,13 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
jdat->num_procs++;
jdat->num_terminated++;
#endif
}
check_job_complete(jdat);
}
#if ORTE_RESIL_ORTE
if (!orte_orteds_term_ordered) {
/* Need to update the orted routing module. */
orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);
@ -2077,10 +2112,12 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
(*fault_cbfunc)(dead_procs);
}
}
#endif
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
int send_to_local_applications(opal_pointer_array_t *dead_names) {
opal_buffer_t *buf;
int ret = ORTE_SUCCESS;
@ -2121,3 +2158,5 @@ int send_to_local_applications(opal_pointer_array_t *dead_names) {
return ret;
}
#endif

View File

@ -522,7 +522,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
wp_item->name.jobid = proc->jobid;
wp_item->name.vpid = proc->vpid;
wp_item->name.epoch = proc->epoch;
ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch);
wp_item->state = state;
opal_list_append(procs_pending_recovery, &(wp_item->super));
@ -626,7 +626,7 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN);
wp->state = 0;
}
@ -635,7 +635,7 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID);
wp->state = 0;
}

View File

@ -750,7 +750,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
@ -807,7 +807,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
@ -855,7 +855,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}

View File

@ -34,6 +34,7 @@
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -41,7 +42,9 @@
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -59,13 +62,15 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int record_dead_process(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
#if ORTE_RESIL_ORTE
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
#endif
/*
* Module functions: Global
@ -104,8 +109,10 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
predicted_fault,
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_set_fault_callback /* Set callback function */
orte_errmgr_base_register_migration_warning
#if ORTE_RESIL_ORTE
,orte_errmgr_base_set_fault_callback /* Set callback function */
#endif
};
/************************
@ -113,16 +120,22 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
************************/
static int init(void)
{
int ret;
int ret = ORTE_SUCCESS;
#if ORTE_RESIL_ORTE
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE,
ORTE_RML_PERSISTENT, failure_notification, NULL);
#endif
return ret;
}
static int finalize(void)
{
#if ORTE_RESIL_ORTE
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE);
#endif
return ORTE_SUCCESS;
}
@ -228,10 +241,10 @@ static int update_state(orte_jobid_t job,
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
break;
@ -276,7 +289,7 @@ static int update_state(orte_jobid_t job,
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* terminate - our routed children will see
* us leave and automatically die
*/
@ -290,10 +303,18 @@ static int update_state(orte_jobid_t job,
if (0 == orte_routed.num_routes() &&
0 == opal_list_get_size(&orte_local_children)) {
orte_quit();
} else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted not exiting, num_routes() == %d, num children == %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_routed.num_routes(),
opal_list_get_size(&orte_local_children)));
}
}
#if ORTE_RESIL_ORTE
record_dead_process(proc);
#endif
/* if not, then indicate we can continue */
return ORTE_SUCCESS;
@ -344,7 +365,7 @@ static int update_state(orte_jobid_t job,
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
}
app = (orte_app_context_t*)