1
1

Add resilience to ORTE. Allows the runtime to continue after a process (or

ORTED) failure. Note that more work will be necessary to allow the MPI layer to
take advantage of this.

Per RFC:
http://www.open-mpi.org/community/lists/devel/2011/06/9299.php

This commit was SVN r24815.
This commit is contained in:
Wesley Bland 2011-06-23 20:38:02 +00:00
parent e8817f3f63
commit e1ba09ad51
130 changed files with 2783 additions and 709 deletions

View File

@ -85,6 +85,7 @@ tprins Tim Prins IU, LANL
twoodall Tim Woodall LANL
vasily Vasily Filipov Mellanox
vsahay Vishal Sahay IU
wbland Wesley Bland UTK
yuw Weikuan Yu LANL, OSU
Affiliation abbreviations:

View File

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -234,6 +234,12 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
return new_errhandler;
}
/**
* Runtime errhandler callback
*/
void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs) {
ompi_mpi_abort(MPI_COMM_WORLD, 1, false);
}
/**************************************************************************
*

View File

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -35,6 +35,8 @@
#include "ompi/errhandler/errhandler_predefined.h"
#include "ompi/errhandler/errcode-internal.h"
#include "orte/types.h"
BEGIN_C_DECLS
/*
@ -358,6 +360,19 @@ struct ompi_request_t;
OMPI_DECLSPEC ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
ompi_errhandler_generic_handler_fn_t *func,
ompi_errhandler_lang_t language);
/**
* Callback function from runtime layer to alert the MPI layer of an error at
* the runtime layer.
*
* @param procs The names of the processes that have failed.
*
* This function is used to alert the MPI layer to a specific fault at the
* runtime layer. Currently, the only faults reported using this method are
* process failures. The MPI layer has the option to perform whatever actions it
 * needs to stabilize itself and continue running, abort, etc.
*/
OMPI_DECLSPEC void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs);
/**
* Check to see if an errhandler is intrinsic.

View File

@ -660,8 +660,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
bool found = false;
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
"jobid %d, vpid %d, sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid, subnet_id, lid));
"jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid, process_name->epoch, subnet_id, lid));
/* find ibproc */
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
for (ib_proc = (mca_btl_openib_proc_t*)

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -1208,6 +1208,7 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
peer = OBJ_NEW(orte_namelist_t);
peer->name.jobid = comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid;
peer->name.vpid = comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid;
peer->name.epoch = comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch;
opal_list_append(&peers, &peer->item);
}
/* prepare send data */

View File

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 The University of Tennessee and The University
* Copyright (c) 2010-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
@ -35,6 +35,7 @@
#include "orte/util/name_fns.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ess/ess.h"
#include "ompi/request/request.h"
#include "ompi/mca/dpm/dpm.h"
@ -701,6 +702,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
@ -728,6 +730,7 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
HOKE_TRAFFIC_MSG_REF_RETURN(item);
@ -837,6 +840,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -864,6 +868,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -897,6 +902,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -928,6 +934,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -947,6 +954,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
}
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
@ -954,6 +962,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
}
@ -1006,7 +1015,7 @@ do { \
}
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
{ \
HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1025,6 +1034,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
\
msg_ref->matched = 0; \
msg_ref->done = 0; \
@ -1033,7 +1043,7 @@ do { \
msg_ref->active_drain = 0; \
}
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
{ \
HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1053,6 +1063,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
}
@ -1455,6 +1466,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch;
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
}
@ -3225,7 +3237,8 @@ static int traffic_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
} else {
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
@ -3364,6 +3377,7 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
(*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch;
}
return exit_status;
@ -3794,7 +3808,8 @@ static int drain_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type,
count, NULL, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
(*msg_ref)->done = 0;
(*msg_ref)->active = 0;
@ -4142,6 +4157,7 @@ static int drain_message_copy_remove(ompi_crcp_bkmrk_pml_drain_message_ref_t *dr
static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
{
opal_list_item_t* item = NULL;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(&ompi_crcp_bkmrk_pml_peer_refs);
item != opal_list_get_end(&ompi_crcp_bkmrk_pml_peer_refs);
@ -4149,7 +4165,9 @@ static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
ompi_crcp_bkmrk_pml_peer_ref_t *cur_peer_ref;
cur_peer_ref = (ompi_crcp_bkmrk_pml_peer_ref_t*)item;
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
if( OPAL_EQUAL == orte_util_compare_name_fields(mask,
&(cur_peer_ref->proc_name),
&proc) ) {
return cur_peer_ref;
@ -5266,6 +5284,7 @@ static int send_bookmarks(int peer_idx)
*/
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
if( NULL == (peer_ref = find_peer(peer_name))) {
opal_output(mca_crcp_bkmrk_component.super.output_handle,
@ -5326,6 +5345,7 @@ static int recv_bookmarks(int peer_idx)
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
OMPI_CRCP_COORD_BOOKMARK_TAG,
@ -5507,6 +5527,7 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
d_msg_ack->peer.epoch = peer_ref->proc_name.epoch;
d_msg_ack->complete = false;
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle,
@ -6146,7 +6167,8 @@ static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t *peer_r
count, datatype_size, tag, rank,
ompi_comm_lookup(comm_id),
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
traffic_message_create_drain_message(true, num_left_unresolved,
peer_ref,

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -1130,6 +1130,7 @@ static void process_cb(int fd, short event, void *data)
/* flag the identity of the remote proc */
carport.jobid = mev->sender.jobid;
carport.vpid = mev->sender.vpid;
carport.epoch = mev->sender.epoch;
/* release the event */
OBJ_RELEASE(mev);

View File

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -395,12 +398,13 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
orte_proc.epoch = hdr->hdr_restart.hdr_epoch;
ompi_proc = ompi_proc_find(&orte_proc);
opal_output_verbose(20, mca_pml_bfo_output,
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
"hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s",
"hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, ompi_proc->proc_hostname=%s",
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
@ -408,8 +412,8 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
hdr->hdr_restart.hdr_restartseq,
recvreq->remote_req_send.pval, (void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
ompi_proc->proc_hostname);
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
hdr->hdr_restart.hdr_epoch, ompi_proc->proc_hostname);
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
return;
}
@ -711,6 +715,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch;
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -415,6 +415,7 @@ struct mca_pml_bfo_restart_hdr_t {
int32_t hdr_dst_rank; /**< needed to send NACK */
uint32_t hdr_jobid; /**< needed to send NACK */
uint32_t hdr_vpid; /**< needed to send NACK */
uint32_t hdr_epoch; /**< needed to send NACK */
};
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
@ -427,6 +428,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
(h).hdr_jobid = ntohl((h).hdr_jobid); \
(h).hdr_vpid = ntohl((h).hdr_vpid); \
(h).hdr_epoch = ntohl((h).hdr_epoch); \
} while (0)
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
@ -435,6 +437,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
(h).hdr_jobid = htonl((h).hdr_jobid); \
(h).hdr_vpid = htonl((h).hdr_vpid); \
(h).hdr_epoch = htonl((h).hdr_epoch); \
} while (0)
#endif /* PML_BFO */

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -108,6 +108,7 @@ int ompi_proc_init(void)
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.vpid = i;
proc->proc_name.epoch = ORTE_EPOCH_MIN;
if (i == ORTE_PROC_MY_NAME->vpid) {
ompi_proc_local_proc = proc;
proc->proc_flags = OPAL_PROC_ALL_LOCAL;
@ -361,6 +362,8 @@ int ompi_proc_refresh(void) {
/* Does not change: proc->proc_name.vpid */
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
/* Make sure to clear the local flag before we set it below */
proc->proc_flags = 0;

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -67,6 +67,7 @@
#include "ompi/communicator/communicator.h"
#include "ompi/info/info.h"
#include "ompi/errhandler/errcode.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/request/request.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
@ -369,6 +370,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
gettimeofday(&ompistart, NULL);
}
/* Register errhandler callback with orte errmgr */
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
/* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow
MPI_THREAD_MULTIPLE. Set this stuff up here early in the

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -81,28 +81,36 @@ typedef uint32_t orte_vpid_t;
#define ORTE_VPID_T OPAL_UINT32
#define ORTE_VPID_MAX UINT32_MAX-2
#define ORTE_VPID_MIN 0
typedef uint32_t orte_epoch_t;
#define ORTE_EPOCH_T OPAL_UINT32
#define ORTE_EPOCH_MAX UINT32_MAX-2
#define ORTE_EPOCH_MIN 0
#define ORTE_PROCESS_NAME_HTON(n) \
do { \
n.jobid = htonl(n.jobid); \
n.vpid = htonl(n.vpid); \
n.epoch = htonl(n.epoch); \
} while (0)
#define ORTE_PROCESS_NAME_NTOH(n) \
do { \
n.jobid = ntohl(n.jobid); \
n.vpid = ntohl(n.vpid); \
n.epoch = ntohl(n.epoch); \
} while (0)
#define ORTE_NAME_ARGS(n) \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid)
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid), \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch)
/*
* define invalid values
*/
#define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2)
#define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2)
#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2)
#define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF)
/*
@ -110,6 +118,7 @@ do { \
*/
#define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1)
#define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1)
#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1)
#define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF)
/*
@ -118,6 +127,14 @@ do { \
struct orte_process_name_t {
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process id - equivalent to rank */
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
* The epoch will start at ORTE_EPOCH_MIN and
* increment every time the process is detected as
* having stopped (including normal shutdown). The
* HNP will be responsible for informing all
* processes that did not directly detect the
* failure to increment their epochs.
*/
};
typedef struct orte_process_name_t orte_process_name_t;
@ -140,35 +157,35 @@ typedef void* orte_iov_base_ptr_t;
#define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an orte_process_name_t */
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
#if !ORTE_DISABLE_FULL_SUPPORT
/* State-related types */
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 5) /**< node status flag */
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< process/resource status */
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< job status flag */
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 8) /**< process exit code */
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< process/resource status */
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 8) /**< job status flag */
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 9) /**< process exit code */
/* Data-passing types */
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 9) /**< registry return value */
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 10) /**< registry return value */
/* Resource types */
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 10) /**< argv and enviro arrays */
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 11) /**< describes capabilities of nodes */
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes slot allocations/reservations */
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 13) /**< job information */
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 14) /**< node information */
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 15) /**< process information */
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 16) /**< map of process locations */
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 11) /**< argv and enviro arrays */
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes capabilities of nodes */
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 13) /**< describes slot allocations/reservations */
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 14) /**< job information */
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 15) /**< node information */
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 16) /**< process information */
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 17) /**< map of process locations */
/* RML types */
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 17) /**< tag for sending/receiving messages */
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 18) /**< tag for sending/receiving messages */
/* DAEMON command type */
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 18) /**< command flag for communicating with the daemon */
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */
/* GRPCOMM types */
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 19)
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20)
/* IOF types */
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20)
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21)
/* provide a boundary for others to use */

View File

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -383,6 +386,7 @@ static void recv_cmd(int status,
dat = OBJ_NEW(orte_db_data_t);
dat->name.jobid = sender->jobid;
dat->name.vpid = sender->vpid;
dat->name.epoch = sender->epoch;
dat->key = key;
count=1;
opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32);

View File

@ -1,9 +1,13 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,11 +26,15 @@
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -48,9 +56,22 @@ static int update_state(orte_jobid_t job,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* HNP module
******************/
@ -64,7 +85,12 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
NULL,
orte_errmgr_base_set_fault_callback,
NULL
};
/************************
@ -87,6 +113,8 @@ static int update_state(orte_jobid_t job,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app: job %s reported state %s"
" for proc %s state %s exit_code %d",
@ -104,9 +132,9 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (ORTE_PROC_MY_NAME->jobid == proc->vpid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
return ORTE_SUCCESS;
}
@ -120,6 +148,95 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
static int post_startup(void) {
int ret = ORTE_SUCCESS;
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
ORTE_RML_TAG_EPOCH_CHANGE,
ORTE_RML_PERSISTENT,
epoch_change_recv,
NULL);
return ret;
}
static int pre_shutdown(void) {
int ret = ORTE_SUCCESS;
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
ORTE_RML_TAG_EPOCH_CHANGE);
return ret;
}
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
void epoch_change(int fd,
short event,
void *data) {
orte_message_event_t *mev = (orte_message_event_t *) data;
opal_buffer_t *buffer = mev->buffer;
orte_process_name_t *proc;
int n = 1, ret, num_dead, i;
opal_pointer_array_t *procs;
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Received epoch change notification",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
procs = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
for (i = 0; i < num_dead; i++) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc[i].epoch++;
orte_util_set_epoch(&proc[i], proc[i].epoch);
opal_pointer_array_add(procs, &proc[i]);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Epoch for %s updated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc[i])));
}
if (NULL != fault_cbfunc && 0 < num_dead) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
(*fault_cbfunc)(procs);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
free(proc);
OBJ_RELEASE(procs);
}
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
int ret, exit_status = ORTE_SUCCESS;
@ -161,7 +278,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
goto cleanup;
}
cleanup:
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -105,6 +105,7 @@ ORTE_DECLSPEC void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, o
/*
* Additional External API function declared in errmgr.h
*/
ORTE_DECLSPEC orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc);
END_C_DECLS

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -97,11 +97,13 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
}
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
}
@ -137,11 +139,13 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->node_name = NULL;
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
item->map_node_name = NULL;
@ -152,6 +156,7 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->node_name ) {
@ -160,6 +165,7 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
}
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->map_node_name ) {
@ -678,6 +684,18 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
#endif
orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc) {
orte_errmgr_fault_callback_t *temp_cbfunc = fault_cbfunc;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"%s errmgr:base Called set_fault_callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
fault_cbfunc = cbfunc;
return temp_cbfunc;
}
/********************
* Local Functions
********************/

View File

@ -55,6 +55,8 @@ orte_errmgr_base_t orte_errmgr_base;
orte_errmgr_base_component_t orte_errmgr_base_selected_component;
orte_errmgr_fault_callback_t *fault_cbfunc;
/* Public module provides a wrapper around previous functions */
orte_errmgr_base_module_t orte_errmgr = {
NULL, /* init */

View File

@ -1,5 +1,8 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* All rights reserved.
*
* $COPYRIGHT$
@ -264,6 +267,7 @@ static int errmgr_base_tool_start_cmdline_listener(void)
*/
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
errmgr_cmdline_sender.epoch = ORTE_EPOCH_INVALID;
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_MIGRATE,
0,
@ -375,12 +379,14 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
swap_dest.jobid = errmgr_cmdline_sender.jobid;
swap_dest.vpid = errmgr_cmdline_sender.vpid;
swap_dest.epoch = errmgr_cmdline_sender.epoch;
errmgr_cmdline_sender = *sender;
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
errmgr_cmdline_sender.jobid = swap_dest.jobid;
errmgr_cmdline_sender.vpid = swap_dest.vpid;
errmgr_cmdline_sender.epoch = swap_dest.epoch;
goto cleanup;
}

View File

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -49,6 +49,7 @@
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "opal/util/opal_sos.h"
@ -90,6 +91,22 @@ struct orte_errmgr_predicted_node_t {
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
/*
* Callback function that should be called when there is a fault.
*
* This callback function will be used anytime (other than during finalize) the
* runtime detects and handles a process failure. The runtime will complete all
* its stabilization before alerting the callback function. The parameter to the
* callback function will be the orte_process_name_t of the process that failed.
* It will not alert the application to failures that are not in the same job as
* the alerted process, only failures within the same jobid.
*
* @param[in] proc The names of the process that failed
*/
typedef void (orte_errmgr_fault_callback_t)(opal_pointer_array_t *procs);
ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
/*
* Structure to describe a suggested remapping element for a predicted fault.
*
@ -242,42 +259,100 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
/**
* Register a callback to alert caller when ORTE is preparing to
* migrate the process to another location. This provides an
* opportunity for the process to checkpoint any required state,
* and to cleanly shutdown.
* Function to perform actions that require the rest of the ORTE layer to be up
* and running.
*
* @param[in] delay Time to delay before assuming process is stuck
* and cannot exit on its own - and thus, go
* ahead and migrate it
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occured
*/
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
/*
* This function gets called just after startup is finished. It gives the errmgr
* a chance to setup anything that requires ORTE to actually be ready to go such
* as registering callbacks, posting receives, etc.
*/
typedef int (*orte_errmgr_base_module_post_startup_t)(void);
/*
* This function gets called just before shutdown begins. It gives the errmgr a
* chance to clean up anything that it did after startup, i.e. deregistering
* callbacks, cleaning up receives, etc.
*/
typedef int (*orte_errmgr_base_module_pre_shutdown_t)(void);
/**
* Function to mark a list of processes as dead and perform any internal cleanup
* necessary.
*
* @param[in] dead_procs Process list that is being marked as dead.
*
* @retval ORTE_SUCCESS The operation completed successfully.
* @retval ORTE_ERROR An unspecified error occurred.
*/
typedef int (*orte_errmgr_base_module_mark_processes_as_dead_t)(opal_pointer_array_t *dead_procs);
/**
* Set the callback function for faults.
*
* @param[in] cbfunc The callback function.
*
* @retval The previous fault callback function.
*/
typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
/**
* Receive updates about failure notifications.
*
* @param[in] sender The process who originally sent the failure notification.
* @param[in] buffer The buffer containing all the information about the failed process.
*
* @retval ORTE_SUCCESS The operation completed successfully.
* @retval ORTE_ERROR An unspecified error occurred.
*/
typedef int (*orte_errmgr_base_module_failure_notification_t)(orte_process_name_t *sender,
opal_buffer_t *buffer);
/*
* Module Structure
*/
struct orte_errmgr_base_module_2_3_0_t {
/** Initialization Function */
orte_errmgr_base_module_init_fn_t init;
orte_errmgr_base_module_init_fn_t init;
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t finalize;
orte_errmgr_base_module_finalize_fn_t finalize;
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
/** Actual process failure notification */
orte_errmgr_base_module_update_state_fn_t update_state;
orte_errmgr_base_module_update_state_fn_t update_state;
/** Predicted process/node failure notification */
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_module_ft_event_fn_t ft_event;
orte_errmgr_base_module_ft_event_fn_t ft_event;
/* Register to be warned of impending migration */
/* Register to be warned of impending migration */
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
/** Perform post-statup operations */
orte_errmgr_base_module_post_startup_t post_startup;
/** Perform pre-shutdown operations */
orte_errmgr_base_module_pre_shutdown_t pre_shutdown;
/* Mark a process as dead. */
orte_errmgr_base_module_mark_processes_as_dead_t mark_processes_as_dead;
/* Set the callback function */
orte_errmgr_base_module_set_fault_callback_t set_fault_callback;
/* Receive failure notification */
orte_errmgr_base_module_failure_notification_t failure_notification;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -57,10 +60,6 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
@ -81,6 +80,10 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_global_ft_event(int state);
int orte_errmgr_hnp_global_post_startup(void);
int orte_errmgr_hnp_global_pre_shutdown(void);
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
/* HNP Versions */
int orte_errmgr_hnp_base_global_init(void);

View File

@ -2,6 +2,9 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*