Add resilience to ORTE. Allows the runtime to continue after a process (or
ORTED) failure. Note that more work will be necessary to allow the MPI layer to take advantage of this. Per RFC: http://www.open-mpi.org/community/lists/devel/2011/06/9299.php This commit was SVN r24815.
This commit is contained in:
parent
e8817f3f63
commit
e1ba09ad51
1
AUTHORS
1
AUTHORS
@ -85,6 +85,7 @@ tprins Tim Prins IU, LANL
|
||||
twoodall Tim Woodall LANL
|
||||
vasily Vasily Filipov Mellanox
|
||||
vsahay Vishal Sahay IU
|
||||
wbland Wesley Bland UTK
|
||||
yuw Weikuan Yu LANL, OSU
|
||||
|
||||
Affiliation abbreviations:
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -234,6 +234,12 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
|
||||
return new_errhandler;
|
||||
}
|
||||
|
||||
/**
|
||||
* Runtime errhandler callback
|
||||
*/
|
||||
void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs) {
|
||||
ompi_mpi_abort(MPI_COMM_WORLD, 1, false);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -35,6 +35,8 @@
|
||||
#include "ompi/errhandler/errhandler_predefined.h"
|
||||
#include "ompi/errhandler/errcode-internal.h"
|
||||
|
||||
#include "orte/types.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
@ -358,6 +360,19 @@ struct ompi_request_t;
|
||||
OMPI_DECLSPEC ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
|
||||
ompi_errhandler_generic_handler_fn_t *func,
|
||||
ompi_errhandler_lang_t language);
|
||||
|
||||
/**
|
||||
* Callback function from runtime layer to alert the MPI layer of an error at
|
||||
* the runtime layer.
|
||||
*
|
||||
* @param procs The names of the processes that have failed.
|
||||
*
|
||||
* This function is used to alert the MPI layer to a specific fault at the
|
||||
* runtime layer. Currently, the only faults reported using this method are
|
||||
* process failures. The MPI layer has the option to perform whatever actions it
|
||||
* needs to stabilize itself and continue running, abort, etc.
|
||||
*/
|
||||
OMPI_DECLSPEC void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs);
|
||||
|
||||
/**
|
||||
* Check to see if an errhandler is intrinsic.
|
||||
|
@ -660,8 +660,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
|
||||
bool found = false;
|
||||
|
||||
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
|
||||
"jobid %d, vpid %d, sid %" PRIx64 ", lid %d",
|
||||
process_name->jobid, process_name->vpid, subnet_id, lid));
|
||||
"jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d",
|
||||
process_name->jobid, process_name->vpid, process_name->epoch, subnet_id, lid));
|
||||
/* find ibproc */
|
||||
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||
for (ib_proc = (mca_btl_openib_proc_t*)
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -1208,6 +1208,7 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
peer = OBJ_NEW(orte_namelist_t);
|
||||
peer->name.jobid = comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid;
|
||||
peer->name.vpid = comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid;
|
||||
peer->name.epoch = comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch;
|
||||
opal_list_append(&peers, &peer->item);
|
||||
}
|
||||
/* prepare send data */
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 The University of Tennessee and The University
|
||||
* Copyright (c) 2010-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -35,6 +35,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/dpm/dpm.h"
|
||||
@ -701,6 +702,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
|
||||
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
peer_rev->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
|
||||
@ -728,6 +730,7 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
|
||||
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
peer_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
|
||||
HOKE_TRAFFIC_MSG_REF_RETURN(item);
|
||||
@ -837,6 +840,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->matched = INVALID_INT;
|
||||
msg_ref->done = INVALID_INT;
|
||||
@ -864,6 +868,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->matched = INVALID_INT;
|
||||
msg_ref->done = INVALID_INT;
|
||||
@ -897,6 +902,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->done = INVALID_INT;
|
||||
msg_ref->active = INVALID_INT;
|
||||
@ -928,6 +934,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->done = INVALID_INT;
|
||||
msg_ref->active = INVALID_INT;
|
||||
@ -947,6 +954,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
|
||||
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
|
||||
}
|
||||
|
||||
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
|
||||
@ -954,6 +962,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
|
||||
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
|
||||
}
|
||||
|
||||
|
||||
@ -1006,7 +1015,7 @@ do { \
|
||||
}
|
||||
|
||||
|
||||
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
|
||||
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
|
||||
{ \
|
||||
HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \
|
||||
\
|
||||
@ -1025,6 +1034,7 @@ do { \
|
||||
\
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
msg_ref->proc_name.epoch = p_epoch; \
|
||||
\
|
||||
msg_ref->matched = 0; \
|
||||
msg_ref->done = 0; \
|
||||
@ -1033,7 +1043,7 @@ do { \
|
||||
msg_ref->active_drain = 0; \
|
||||
}
|
||||
|
||||
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
|
||||
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
|
||||
{ \
|
||||
HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \
|
||||
\
|
||||
@ -1053,6 +1063,7 @@ do { \
|
||||
\
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
msg_ref->proc_name.epoch = p_epoch; \
|
||||
}
|
||||
|
||||
|
||||
@ -1455,6 +1466,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
|
||||
|
||||
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
|
||||
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
|
||||
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch;
|
||||
|
||||
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
|
||||
}
|
||||
@ -3225,7 +3237,8 @@ static int traffic_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
CREATE_NEW_MSG((*msg_ref), msg_type,
|
||||
count, ddt_size, tag, dest, comm,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
peer_ref->proc_name.vpid
|
||||
peer_ref->proc_name.epoch);
|
||||
} else {
|
||||
CREATE_NEW_MSG((*msg_ref), msg_type,
|
||||
count, ddt_size, tag, dest, comm,
|
||||
@ -3364,6 +3377,7 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
|
||||
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
|
||||
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
|
||||
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
|
||||
(*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
@ -3794,7 +3808,8 @@ static int drain_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type,
|
||||
count, NULL, tag, dest, comm,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
peer_ref->proc_name.vpid
|
||||
peer_ref->proc_name.epoch);
|
||||
|
||||
(*msg_ref)->done = 0;
|
||||
(*msg_ref)->active = 0;
|
||||
@ -4142,6 +4157,7 @@ static int drain_message_copy_remove(ompi_crcp_bkmrk_pml_drain_message_ref_t *dr
|
||||
static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
|
||||
{
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(&ompi_crcp_bkmrk_pml_peer_refs);
|
||||
item != opal_list_get_end(&ompi_crcp_bkmrk_pml_peer_refs);
|
||||
@ -4149,7 +4165,9 @@ static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
|
||||
ompi_crcp_bkmrk_pml_peer_ref_t *cur_peer_ref;
|
||||
cur_peer_ref = (ompi_crcp_bkmrk_pml_peer_ref_t*)item;
|
||||
|
||||
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
|
||||
if( OPAL_EQUAL == orte_util_compare_name_fields(mask,
|
||||
&(cur_peer_ref->proc_name),
|
||||
&proc) ) {
|
||||
return cur_peer_ref;
|
||||
@ -5266,6 +5284,7 @@ static int send_bookmarks(int peer_idx)
|
||||
*/
|
||||
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
|
||||
|
||||
if( NULL == (peer_ref = find_peer(peer_name))) {
|
||||
opal_output(mca_crcp_bkmrk_component.super.output_handle,
|
||||
@ -5326,6 +5345,7 @@ static int recv_bookmarks(int peer_idx)
|
||||
|
||||
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
|
||||
|
||||
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
|
||||
OMPI_CRCP_COORD_BOOKMARK_TAG,
|
||||
@ -5507,6 +5527,7 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
|
||||
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
|
||||
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
|
||||
d_msg_ack->peer.epoch = peer_ref->proc_name.epoch;
|
||||
d_msg_ack->complete = false;
|
||||
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle,
|
||||
@ -6146,7 +6167,8 @@ static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t *peer_r
|
||||
count, datatype_size, tag, rank,
|
||||
ompi_comm_lookup(comm_id),
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
peer_ref->proc_name.vpid
|
||||
peer_ref->proc_name.epoch);
|
||||
|
||||
traffic_message_create_drain_message(true, num_left_unresolved,
|
||||
peer_ref,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -1130,6 +1130,7 @@ static void process_cb(int fd, short event, void *data)
|
||||
/* flag the identity of the remote proc */
|
||||
carport.jobid = mev->sender.jobid;
|
||||
carport.vpid = mev->sender.vpid;
|
||||
carport.epoch = mev->sender.epoch;
|
||||
|
||||
/* release the event */
|
||||
OBJ_RELEASE(mev);
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -395,12 +398,13 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
|
||||
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
|
||||
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
|
||||
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
|
||||
orte_proc.epoch = hdr->hdr_restart.hdr_epoch;
|
||||
ompi_proc = ompi_proc_find(&orte_proc);
|
||||
opal_output_verbose(20, mca_pml_bfo_output,
|
||||
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
|
||||
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
|
||||
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
|
||||
"hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s",
|
||||
"hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, ompi_proc->proc_hostname=%s",
|
||||
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
|
||||
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||||
@ -408,8 +412,8 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
|
||||
hdr->hdr_restart.hdr_restartseq,
|
||||
recvreq->remote_req_send.pval, (void *)recvreq,
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||||
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
|
||||
ompi_proc->proc_hostname);
|
||||
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
|
||||
hdr->hdr_restart.hdr_epoch, ompi_proc->proc_hostname);
|
||||
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
|
||||
return;
|
||||
}
|
||||
@ -711,6 +715,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
|
||||
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
|
||||
restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
|
||||
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -415,6 +415,7 @@ struct mca_pml_bfo_restart_hdr_t {
|
||||
int32_t hdr_dst_rank; /**< needed to send NACK */
|
||||
uint32_t hdr_jobid; /**< needed to send NACK */
|
||||
uint32_t hdr_vpid; /**< needed to send NACK */
|
||||
uint32_t hdr_epoch; /**< needed to send NACK */
|
||||
};
|
||||
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
|
||||
@ -427,6 +428,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
|
||||
(h).hdr_jobid = ntohl((h).hdr_jobid); \
|
||||
(h).hdr_vpid = ntohl((h).hdr_vpid); \
|
||||
(h).hdr_epoch = ntohl((h).hdr_epoch); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
|
||||
@ -435,6 +437,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
|
||||
(h).hdr_jobid = htonl((h).hdr_jobid); \
|
||||
(h).hdr_vpid = htonl((h).hdr_vpid); \
|
||||
(h).hdr_epoch = htonl((h).hdr_epoch); \
|
||||
} while (0)
|
||||
|
||||
#endif /* PML_BFO */
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
|
||||
@ -108,6 +108,7 @@ int ompi_proc_init(void)
|
||||
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
proc->proc_name.vpid = i;
|
||||
proc->proc_name.epoch = ORTE_EPOCH_MIN;
|
||||
if (i == ORTE_PROC_MY_NAME->vpid) {
|
||||
ompi_proc_local_proc = proc;
|
||||
proc->proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||
@ -361,6 +362,8 @@ int ompi_proc_refresh(void) {
|
||||
|
||||
/* Does not change: proc->proc_name.vpid */
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
|
||||
|
||||
/* Make sure to clear the local flag before we set it below */
|
||||
proc->proc_flags = 0;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -67,6 +67,7 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/info/info.h"
|
||||
#include "ompi/errhandler/errcode.h"
|
||||
#include "ompi/errhandler/errhandler.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "ompi/mca/op/op.h"
|
||||
@ -369,6 +370,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
gettimeofday(&ompistart, NULL);
|
||||
}
|
||||
|
||||
/* Register errhandler callback with orte errmgr */
|
||||
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
|
||||
|
||||
/* Figure out the final MPI thread levels. If we were not
|
||||
compiled for support for MPI threads, then don't allow
|
||||
MPI_THREAD_MULTIPLE. Set this stuff up here early in the
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -81,28 +81,36 @@ typedef uint32_t orte_vpid_t;
|
||||
#define ORTE_VPID_T OPAL_UINT32
|
||||
#define ORTE_VPID_MAX UINT32_MAX-2
|
||||
#define ORTE_VPID_MIN 0
|
||||
typedef uint32_t orte_epoch_t;
|
||||
#define ORTE_EPOCH_T OPAL_UINT32
|
||||
#define ORTE_EPOCH_MAX UINT32_MAX-2
|
||||
#define ORTE_EPOCH_MIN 0
|
||||
|
||||
/*
 * Convert the jobid, vpid and epoch fields of a process name from host to
 * network byte order, in place.
 *
 * n must be an lvalue of a type with those three 32-bit fields. It is
 * parenthesized in the expansion so that expressions such as *ptr or
 * names[i] can be passed; note that it is evaluated several times, so it
 * must not have side effects.
 */
#define ORTE_PROCESS_NAME_HTON(n)           \
do {                                        \
    (n).jobid = htonl((n).jobid);           \
    (n).vpid  = htonl((n).vpid);            \
    (n).epoch = htonl((n).epoch);           \
} while (0)
|
||||
|
||||
/*
 * Convert the jobid, vpid and epoch fields of a process name from network
 * to host byte order, in place.
 *
 * n must be an lvalue of a type with those three 32-bit fields. It is
 * parenthesized in the expansion so that expressions such as *ptr or
 * names[i] can be passed; note that it is evaluated several times, so it
 * must not have side effects.
 */
#define ORTE_PROCESS_NAME_NTOH(n)           \
do {                                        \
    (n).jobid = ntohl((n).jobid);           \
    (n).vpid  = ntohl((n).vpid);            \
    (n).epoch = ntohl((n).epoch);           \
} while (0)
|
||||
|
||||
/*
 * Expand a process-name pointer into three comma-separated unsigned long
 * values, in the order jobid, vpid, epoch, for use in an argument list.
 * When n is NULL the corresponding ORTE_*_INVALID constant is substituted
 * for each field.
 *
 * Each of the three values must be followed by a comma except the last;
 * without the separator after the vpid expression the expansion does not
 * parse. n is evaluated more than once, so it must not have side effects.
 */
#define ORTE_NAME_ARGS(n) \
    (unsigned long) ((NULL == (n)) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
    (unsigned long) ((NULL == (n)) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid), \
    (unsigned long) ((NULL == (n)) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch)
|
||||
|
||||
/*
|
||||
* define invalid values
|
||||
*/
|
||||
#define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2)
|
||||
#define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2)
|
||||
#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2)
|
||||
#define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF)
|
||||
|
||||
/*
|
||||
@ -110,6 +118,7 @@ do { \
|
||||
*/
|
||||
#define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1)
|
||||
#define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1)
|
||||
#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1)
|
||||
#define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF)
|
||||
|
||||
/*
|
||||
@ -118,6 +127,14 @@ do { \
|
||||
struct orte_process_name_t {
|
||||
orte_jobid_t jobid; /**< Job number */
|
||||
orte_vpid_t vpid; /**< Process id - equivalent to rank */
|
||||
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
|
||||
* The epoch will start at ORTE_EPOCH_MIN and
|
||||
* increment every time the process is detected as
|
||||
* having stopped (including normal shutdown). The
|
||||
* HNP will be responsible for informing all
|
||||
* processes that did not directly detect the
|
||||
* failure to increment their epochs.
|
||||
*/
|
||||
};
|
||||
typedef struct orte_process_name_t orte_process_name_t;
|
||||
|
||||
@ -140,35 +157,35 @@ typedef void* orte_iov_base_ptr_t;
|
||||
#define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an orte_process_name_t */
|
||||
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
|
||||
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
|
||||
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
/* State-related types */
|
||||
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 5) /**< node status flag */
|
||||
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< process/resource status */
|
||||
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< job status flag */
|
||||
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 8) /**< process exit code */
|
||||
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */
|
||||
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< process/resource status */
|
||||
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 8) /**< job status flag */
|
||||
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 9) /**< process exit code */
|
||||
/* Data-passing types */
|
||||
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 9) /**< registry return value */
|
||||
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 10) /**< registry return value */
|
||||
/* Resource types */
|
||||
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 10) /**< argv and enviro arrays */
|
||||
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 11) /**< describes capabilities of nodes */
|
||||
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes slot allocations/reservations */
|
||||
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 13) /**< job information */
|
||||
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 14) /**< node information */
|
||||
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 15) /**< process information */
|
||||
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 16) /**< map of process locations */
|
||||
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 11) /**< argv and enviro arrays */
|
||||
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes capabilities of nodes */
|
||||
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 13) /**< describes slot allocations/reservations */
|
||||
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 14) /**< job information */
|
||||
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 15) /**< node information */
|
||||
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 16) /**< process information */
|
||||
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 17) /**< map of process locations */
|
||||
|
||||
/* RML types */
|
||||
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 17) /**< tag for sending/receiving messages */
|
||||
|
||||
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 18) /**< tag for sending/receiving messages */
|
||||
/* DAEMON command type */
|
||||
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 18) /**< command flag for communicating with the daemon */
|
||||
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */
|
||||
|
||||
/* GRPCOMM types */
|
||||
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 19)
|
||||
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20)
|
||||
|
||||
/* IOF types */
|
||||
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20)
|
||||
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21)
|
||||
|
||||
|
||||
/* provide a boundary for others to use */
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -383,6 +386,7 @@ static void recv_cmd(int status,
|
||||
dat = OBJ_NEW(orte_db_data_t);
|
||||
dat->name.jobid = sender->jobid;
|
||||
dat->name.vpid = sender->vpid;
|
||||
dat->name.epoch= sender->epoch;
|
||||
dat->key = key;
|
||||
count=1;
|
||||
opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32);
|
||||
|
@ -1,9 +1,13 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -22,11 +26,15 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
@ -48,9 +56,22 @@ static int update_state(orte_jobid_t job,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
static int post_startup(void);
|
||||
static int pre_shutdown(void);
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata);
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data);
|
||||
|
||||
/******************
|
||||
* HNP module
|
||||
******************/
|
||||
@ -64,7 +85,12 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
post_startup,
|
||||
pre_shutdown,
|
||||
NULL,
|
||||
orte_errmgr_base_set_fault_callback,
|
||||
NULL
|
||||
};
|
||||
|
||||
/************************
|
||||
@ -87,6 +113,8 @@ static int update_state(orte_jobid_t job,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
@ -104,9 +132,9 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
/* if it is our own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->vpid &&
|
||||
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -120,6 +148,95 @@ static int update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int post_startup(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE,
|
||||
ORTE_RML_PERSISTENT,
|
||||
epoch_change_recv,
|
||||
NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int pre_shutdown(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata) {
|
||||
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
|
||||
}
|
||||
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data) {
|
||||
orte_message_event_t *mev = (orte_message_event_t *) data;
|
||||
opal_buffer_t *buffer = mev->buffer;
|
||||
orte_process_name_t *proc;
|
||||
int n = 1, ret, num_dead, i;
|
||||
opal_pointer_array_t *procs;
|
||||
|
||||
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Received epoch change notification",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
procs = OBJ_NEW(opal_pointer_array_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
|
||||
for (i = 0; i < num_dead; i++) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
proc[i].epoch++;
|
||||
orte_util_set_epoch(&proc[i], proc[i].epoch);
|
||||
|
||||
opal_pointer_array_add(procs, &proc[i]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Epoch for %s updated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc[i])));
|
||||
}
|
||||
|
||||
if (NULL != fault_cbfunc && 0 < num_dead) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
(*fault_cbfunc)(procs);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback failed!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
free(proc);
|
||||
OBJ_RELEASE(procs);
|
||||
}
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
@ -161,7 +278,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
||||
return exit_status;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -105,6 +105,7 @@ ORTE_DECLSPEC void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, o
|
||||
/*
|
||||
* Additional External API function declared in errmgr.h
|
||||
*/
|
||||
ORTE_DECLSPEC orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -97,11 +97,13 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
item->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
}
|
||||
|
||||
/*
 * Destructor for orte_errmgr_predicted_proc_t.
 *
 * Resets every field of the embedded process name to its INVALID
 * sentinel.  No memory is released here.
 */
void orte_errmgr_predicted_proc_destruct(orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
    item->proc_name.epoch = ORTE_EPOCH_INVALID;
}
|
||||
|
||||
@ -137,11 +139,13 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
|
||||
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
item->node_name = NULL;
|
||||
|
||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
item->map_node_name = NULL;
|
||||
@ -152,6 +156,7 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
||||
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
if( NULL != item->node_name ) {
|
||||
@ -160,6 +165,7 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
||||
}
|
||||
|
||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
if( NULL != item->map_node_name ) {
|
||||
@ -678,6 +684,18 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Install a new fault callback.
 *
 * Stores cbfunc in the global fault_cbfunc and hands back the value that
 * was stored there before the call, so the caller can restore it later.
 *
 * @param cbfunc  The callback to install.
 *
 * @return The callback that was installed before this call.
 */
orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc)
{
    orte_errmgr_fault_callback_t *previous;

    /* remember the current callback before it is overwritten */
    previous = fault_cbfunc;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s errmgr:base Called set_fault_callback",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    fault_cbfunc = cbfunc;
    return previous;
}
|
||||
|
||||
/********************
|
||||
* Local Functions
|
||||
********************/
|
||||
|
@ -55,6 +55,8 @@ orte_errmgr_base_t orte_errmgr_base;
|
||||
|
||||
orte_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
|
||||
orte_errmgr_fault_callback_t *fault_cbfunc;
|
||||
|
||||
/* Public module provides a wrapper around previous functions */
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
NULL, /* init */
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -264,6 +267,7 @@ static int errmgr_base_tool_start_cmdline_listener(void)
|
||||
*/
|
||||
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
|
||||
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
|
||||
errmgr_cmdline_sender.epoch = ORTE_EPOCH_INVALID;
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_MIGRATE,
|
||||
0,
|
||||
@ -375,12 +379,14 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
|
||||
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
||||
swap_dest.jobid = errmgr_cmdline_sender.jobid;
|
||||
swap_dest.vpid = errmgr_cmdline_sender.vpid;
|
||||
swap_dest.epoch = errmgr_cmdline_sender.epoch;
|
||||
|
||||
errmgr_cmdline_sender = *sender;
|
||||
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
|
||||
|
||||
errmgr_cmdline_sender.jobid = swap_dest.jobid;
|
||||
errmgr_cmdline_sender.vpid = swap_dest.vpid;
|
||||
errmgr_cmdline_sender.epoch = swap_dest.epoch;
|
||||
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -49,6 +49,7 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
@ -90,6 +91,22 @@ struct orte_errmgr_predicted_node_t {
|
||||
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
|
||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
|
||||
|
||||
/*
|
||||
* Callback function that should be called when there is a fault.
|
||||
*
|
||||
* This callback function will be used anytime (other than during finalize) the
|
||||
* runtime detects and handles a process failure. The runtime will complete all
|
||||
* its stabilization before alerting the callback function. The parameter to the
|
||||
* callback function will be a pointer array of the orte_process_name_t of each process that failed.
|
||||
* It will not alert the application to failures that are not in the same job as
|
||||
* the alerted process, only failures within the same jobid.
|
||||
*
|
||||
* @param[in] procs The names of the processes that failed
|
||||
*/
|
||||
typedef void (orte_errmgr_fault_callback_t)(opal_pointer_array_t *procs);
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
|
||||
|
||||
/*
|
||||
* Structure to describe a suggested remapping element for a predicted fault.
|
||||
*
|
||||
@ -242,42 +259,100 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
|
||||
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
||||
|
||||
/**
|
||||
* Register a callback to alert caller when ORTE is preparing to
|
||||
* migrate the process to another location. This provides an
|
||||
* opportunity for the process to checkpoint any required state,
|
||||
* and to cleanly shutdown.
|
||||
* Function to perform actions that require the rest of the ORTE layer to be up
|
||||
* and running.
|
||||
*
|
||||
* @param[in] delay Time to delay before assuming process is stuck
|
||||
* and cannot exit on its own - and thus, go
|
||||
* ahead and migrate it
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
|
||||
|
||||
/*
|
||||
* This function gets called just after startup is finished. It gives the errmgr
|
||||
* a chance to setup anything that requires ORTE to actually be ready to go such
|
||||
* as registering callbacks, posting receives, etc.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_post_startup_t)(void);
|
||||
|
||||
/*
|
||||
* This function gets called just before shutdown begins. It gives the errmgr a
|
||||
* chance to clean up anything that it did after startup, i.e. deregistering
|
||||
* callbacks, cleaning up receives, etc.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_pre_shutdown_t)(void);
|
||||
|
||||
/**
|
||||
* Function to mark a list of processes as dead and perform any internal cleanup
|
||||
* necessary.
|
||||
*
|
||||
* @param[in] dead_procs Process list that is being marked as dead.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully.
|
||||
* @retval ORTE_ERROR An unspecified error occurred.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_mark_processes_as_dead_t)(opal_pointer_array_t *dead_procs);
|
||||
|
||||
/**
|
||||
* Set the callback function for faults.
|
||||
*
|
||||
* @param[in] cbfunc The callback function.
|
||||
*
|
||||
* @retval The previous fault callback function.
|
||||
*/
|
||||
typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
|
||||
|
||||
/**
|
||||
* Receive updates about failure notifications.
|
||||
*
|
||||
* @param[in] sender The process who originally sent the failure notification.
|
||||
* @param[in] buffer The buffer containing all the information about the failed process.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully.
|
||||
* @retval ORTE_ERROR An unspecified error occurred.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_failure_notification_t)(orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer);
|
||||
|
||||
/*
|
||||
* Module Structure
|
||||
*/
|
||||
struct orte_errmgr_base_module_2_3_0_t {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_base_module_init_fn_t init;
|
||||
orte_errmgr_base_module_init_fn_t init;
|
||||
/** Finalization Function */
|
||||
orte_errmgr_base_module_finalize_fn_t finalize;
|
||||
orte_errmgr_base_module_finalize_fn_t finalize;
|
||||
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||
|
||||
/** Actual process failure notification */
|
||||
orte_errmgr_base_module_update_state_fn_t update_state;
|
||||
orte_errmgr_base_module_update_state_fn_t update_state;
|
||||
/** Predicted process/node failure notification */
|
||||
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
||||
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
||||
/** Suggest a node to map a restarting process onto */
|
||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
||||
|
||||
/** Handle any FT Notifications */
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
|
||||
/* Register to be warned of impending migration */
|
||||
/* Register to be warned of impending migration */
|
||||
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
|
||||
|
||||
/** Perform post-startup operations */
|
||||
orte_errmgr_base_module_post_startup_t post_startup;
|
||||
|
||||
/** Perform pre-shutdown operations */
|
||||
orte_errmgr_base_module_pre_shutdown_t pre_shutdown;
|
||||
|
||||
/* Mark a process as dead. */
|
||||
orte_errmgr_base_module_mark_processes_as_dead_t mark_processes_as_dead;
|
||||
|
||||
/* Set the callback function */
|
||||
orte_errmgr_base_module_set_fault_callback_t set_fault_callback;
|
||||
|
||||
/* Receive failure notification */
|
||||
orte_errmgr_base_module_failure_notification_t failure_notification;
|
||||
};
|
||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -57,10 +60,6 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
|
||||
orte_vpid_t vpid,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/***************************
|
||||
* Module functions: Global
|
||||
@ -81,6 +80,10 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnp_global_ft_event(int state);
|
||||
int orte_errmgr_hnp_global_post_startup(void);
|
||||
int orte_errmgr_hnp_global_pre_shutdown(void);
|
||||
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
|
||||
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
|
||||
|
||||
/* HNP Versions */
|
||||
int orte_errmgr_hnp_base_global_init(void);
|
||||
|
@ -2,6 +2,9 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||