1
1

By popular demand, the epoch code is now disabled by default.

To enable epochs and the resilient ORTE code, pass the following flag to configure:

--enable-resilient-orte

This will define both:

ORTE_ENABLE_EPOCH
ORTE_RESIL_ORTE

This commit was SVN r25093.
Этот коммит содержится в:
Wesley Bland 2011-08-26 22:16:14 +00:00
родитель 55a7b474dd
Коммит 4e7ff0bd5e
101 изменённых файлов: 652 добавлений и 362 удалений

Просмотреть файл

@ -693,8 +693,16 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
bool found = false;
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
"jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid, process_name->epoch, subnet_id, lid));
"jobid %d, vpid %d, "
#if ORTE_ENABLE_EPOCH
"epoch %d, "
#endif
"sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid,
#if ORTE_ENABLE_EPOCH
process_name->epoch,
#endif
subnet_id, lid));
/* find ibproc */
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
for (ib_proc = (mca_btl_openib_proc_t*)

Просмотреть файл

@ -1208,7 +1208,8 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
peer = OBJ_NEW(orte_namelist_t);
peer->name.jobid = comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid;
peer->name.vpid = comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid;
peer->name.epoch = comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch;
ORTE_EPOCH_SET(peer->name.epoch,comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch);
opal_list_append(&peers, &peer->item);
}
/* prepare send data */

Просмотреть файл

@ -702,7 +702,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN);
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
@ -730,7 +730,7 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN);
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
HOKE_TRAFFIC_MSG_REF_RETURN(item);
@ -840,7 +840,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -868,7 +868,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -902,7 +902,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -934,7 +934,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -954,7 +954,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN);
}
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
@ -962,7 +962,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN);
}
@ -1015,7 +1015,7 @@ do { \
}
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
{ \
HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1034,7 +1034,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \
\
msg_ref->matched = 0; \
msg_ref->done = 0; \
@ -1043,7 +1043,7 @@ do { \
msg_ref->active_drain = 0; \
}
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
{ \
HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1063,7 +1063,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \
}
@ -1466,7 +1466,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch;
ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch);
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
}
@ -3237,13 +3237,11 @@ static int traffic_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
peer_ref->proc_name.vpid);
} else {
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
ORTE_JOBID_INVALID, ORTE_VPID_INVALID,
ORTE_EPOCH_INVALID);
ORTE_JOBID_INVALID, ORTE_VPID_INVALID);
}
if( msg_type == COORD_MSG_TYPE_P_SEND ||
@ -3377,7 +3375,7 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
(*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch;
ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch);
}
return exit_status;
@ -3808,8 +3806,7 @@ static int drain_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type,
count, NULL, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
peer_ref->proc_name.vpid);
(*msg_ref)->done = 0;
(*msg_ref)->active = 0;
@ -5284,8 +5281,7 @@ static int send_bookmarks(int peer_idx)
*/
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = ORTE_EPOCH_INVALID;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name));
if( NULL == (peer_ref = find_peer(peer_name))) {
opal_output(mca_crcp_bkmrk_component.super.output_handle,
@ -5346,8 +5342,7 @@ static int recv_bookmarks(int peer_idx)
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = ORTE_EPOCH_INVALID;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name));
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
OMPI_CRCP_COORD_BOOKMARK_TAG,
@ -5529,7 +5524,8 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
d_msg_ack->peer.epoch = peer_ref->proc_name.epoch;
ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch);
d_msg_ack->complete = false;
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle,
@ -6169,8 +6165,7 @@ static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t *peer_r
count, datatype_size, tag, rank,
ompi_comm_lookup(comm_id),
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
peer_ref->proc_name.vpid);
traffic_message_create_drain_message(true, num_left_unresolved,
peer_ref,

Просмотреть файл

@ -1130,7 +1130,7 @@ static void process_cb(int fd, short event, void *data)
/* flag the identity of the remote proc */
carport.jobid = mev->sender.jobid;
carport.vpid = mev->sender.vpid;
carport.epoch = mev->sender.epoch;
ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch);
/* release the event */
OBJ_RELEASE(mev);

Просмотреть файл

@ -1,8 +1,5 @@
/*
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -398,13 +395,13 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
orte_proc.epoch = hdr->hdr_restart.hdr_epoch;
ompi_proc = ompi_proc_find(&orte_proc);
opal_output_verbose(20, mca_pml_bfo_output,
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
"hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, ompi_proc->proc_hostname=%s",
"hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s",
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
@ -413,7 +410,7 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
recvreq->remote_req_send.pval, (void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
hdr->hdr_restart.hdr_epoch, ompi_proc->proc_hostname);
ompi_proc->proc_hostname);
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
return;
}
@ -715,7 +712,6 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch;
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);

Просмотреть файл

@ -2,9 +2,6 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
@ -415,7 +412,6 @@ struct mca_pml_bfo_restart_hdr_t {
int32_t hdr_dst_rank; /**< needed to send NACK */
uint32_t hdr_jobid; /**< needed to send NACK */
uint32_t hdr_vpid; /**< needed to send NACK */
uint32_t hdr_epoch; /**< needed to send NACK */
};
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
@ -428,7 +424,6 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
(h).hdr_jobid = ntohl((h).hdr_jobid); \
(h).hdr_vpid = ntohl((h).hdr_vpid); \
(h).hdr_epoch = ntohl((h).hdr_epoch); \
} while (0)
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
@ -437,7 +432,6 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
(h).hdr_jobid = htonl((h).hdr_jobid); \
(h).hdr_vpid = htonl((h).hdr_vpid); \
(h).hdr_epoch = htonl((h).hdr_epoch); \
} while (0)
#endif /* PML_BFO */

Просмотреть файл

@ -108,7 +108,8 @@ int ompi_proc_init(void)
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.vpid = i;
proc->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN);
if (i == ORTE_PROC_MY_NAME->vpid) {
ompi_proc_local_proc = proc;
proc->proc_flags = OPAL_PROC_ALL_LOCAL;
@ -362,8 +363,7 @@ int ompi_proc_refresh(void) {
/* Does not change: proc->proc_name.vpid */
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.epoch = ORTE_EPOCH_INVALID;
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name));
/* Make sure to clear the local flag before we set it below */
proc->proc_flags = 0;

Просмотреть файл

@ -415,6 +415,14 @@ AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT_CR], [$opal_want_ft_cr],
AM_CONDITIONAL(WANT_FT, test "$opal_want_ft" = "1")
AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1")
#
# Compile in resilient runtime code
#
AC_ARG_ENABLE(resilient-orte,
[AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient runtime code.])])
AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"])
AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"])
#
# Do we want to install binaries?
#

Просмотреть файл

@ -81,24 +81,43 @@ typedef uint32_t orte_vpid_t;
/*
 * Type identifier and value limits that accompany orte_vpid_t.
 * The MAX value is parenthesized so that it stays a single operand when the
 * macro is used inside a larger expression (e.g. ORTE_VPID_MAX * 2).
 * NOTE(review): the two values above MAX are presumably reserved for the
 * INVALID/WILDCARD sentinels - confirm against their definitions.
 */
#define ORTE_VPID_T OPAL_UINT32
#define ORTE_VPID_MAX (UINT32_MAX-2)
#define ORTE_VPID_MIN 0

/*
 * The epoch type and its limits are only declared when the build enables
 * ORTE_ENABLE_EPOCH.
 */
#if ORTE_ENABLE_EPOCH
typedef uint32_t orte_epoch_t;
#define ORTE_EPOCH_T OPAL_UINT32
#define ORTE_EPOCH_MAX (UINT32_MAX-2)
#define ORTE_EPOCH_MIN 0
#endif
/*
 * Convert every field of a process name from host to network byte order,
 * in place.
 *
 * n must be an lvalue of a structure type with uint32 jobid and vpid members
 * (plus epoch when ORTE_ENABLE_EPOCH is set).  The argument is parenthesized
 * at each use so that expressions such as *ptr or array[i] can be passed.
 * Note that n is evaluated more than once, so it must not have side effects.
 */
#if ORTE_ENABLE_EPOCH
#define ORTE_PROCESS_NAME_HTON(n) \
do { \
    (n).jobid = htonl((n).jobid); \
    (n).vpid = htonl((n).vpid); \
    (n).epoch = htonl((n).epoch); \
} while (0)
#else
#define ORTE_PROCESS_NAME_HTON(n) \
do { \
    (n).jobid = htonl((n).jobid); \
    (n).vpid = htonl((n).vpid); \
} while (0)
#endif
/*
 * Convert every field of a process name from network to host byte order,
 * in place.
 *
 * n must be an lvalue of a structure type with uint32 jobid and vpid members
 * (plus epoch when ORTE_ENABLE_EPOCH is set).  The argument is parenthesized
 * at each use so that expressions such as *ptr or array[i] can be passed.
 * Note that n is evaluated more than once, so it must not have side effects.
 */
#if ORTE_ENABLE_EPOCH
#define ORTE_PROCESS_NAME_NTOH(n) \
do { \
    (n).jobid = ntohl((n).jobid); \
    (n).vpid = ntohl((n).vpid); \
    (n).epoch = ntohl((n).epoch); \
} while (0)
#else
#define ORTE_PROCESS_NAME_NTOH(n) \
do { \
    (n).jobid = ntohl((n).jobid); \
    (n).vpid = ntohl((n).vpid); \
} while (0)
#endif
#define ORTE_NAME_ARGS(n) \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
@ -127,6 +146,7 @@ do { \
struct orte_process_name_t {
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process id - equivalent to rank */
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
* The epoch will start at ORTE_EPOCH_MIN and
* increment every time the process is detected as
@ -135,6 +155,7 @@ struct orte_process_name_t {
* processes that did not directly detect the
* failure to increment their epochs.
*/
#endif
};
typedef struct orte_process_name_t orte_process_name_t;
@ -157,7 +178,10 @@ typedef void* orte_iov_base_ptr_t;
#define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an orte_process_name_t */
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
#if ORTE_ENABLE_EPOCH
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
/* State-related types */

Просмотреть файл

@ -386,7 +386,7 @@ static void recv_cmd(int status,
dat = OBJ_NEW(orte_db_data_t);
dat->name.jobid = sender->jobid;
dat->name.vpid = sender->vpid;
dat->name.epoch= sender->epoch;
ORTE_EPOCH_SET(dat->name.epoch,sender->epoch);
dat->key = key;
count=1;
opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32);

Просмотреть файл

@ -82,8 +82,10 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_set_fault_callback
orte_errmgr_base_register_migration_warning
#if ORTE_RESIL_ORTE
,orte_errmgr_base_set_fault_callback
#endif
};
/************************
@ -93,18 +95,23 @@ static int init(void)
{
int ret = ORTE_SUCCESS;
#if ORTE_RESIL_ORTE
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_EPOCH_CHANGE,
ORTE_RML_PERSISTENT,
epoch_change_recv,
NULL);
#endif
return ret;
}
/*
 * Module shutdown hook.
 *
 * When the resilient runtime is compiled in, cancel the wildcard receive
 * registered for ORTE_RML_TAG_EPOCH_CHANGE; otherwise there is nothing to
 * undo.  Always reports ORTE_SUCCESS.
 */
static int finalize(void)
{
#if ORTE_RESIL_ORTE
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_EPOCH_CHANGE);
#endif

    return ORTE_SUCCESS;
}
@ -151,6 +158,7 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
@ -209,15 +217,20 @@ void epoch_change(int fd,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
(*fault_cbfunc)(procs);
} else if (NULL == fault_cbfunc) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed (NULL pointer)!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
} else {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed!",
"%s errmgr:app Calling fault callback failed (num_dead <= 0)!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
free(proc);
OBJ_RELEASE(procs);
}
#endif
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{

Просмотреть файл

@ -97,13 +97,13 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN);
}
/*
 * Destructor for orte_errmgr_predicted_proc_t: reset the stored process name
 * to the invalid sentinels.
 *
 * The epoch is written only through ORTE_EPOCH_SET.  A direct store to
 * proc_name.epoch must not be used here, because orte_process_name_t only
 * has an epoch member when ORTE_ENABLE_EPOCH is set.
 * NOTE(review): ORTE_EPOCH_SET is defined elsewhere; presumably it expands to
 * nothing when epochs are disabled - confirm.
 */
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.vpid = ORTE_VPID_INVALID;
    ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID);
    item->proc_name.jobid = ORTE_JOBID_INVALID;
}
@ -139,13 +139,13 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN);
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->node_name = NULL;
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN);
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
item->map_node_name = NULL;
@ -156,7 +156,7 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID);
item->proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->node_name ) {
@ -165,7 +165,7 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
}
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID);
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->map_node_name ) {

Просмотреть файл

@ -267,7 +267,7 @@ static int errmgr_base_tool_start_cmdline_listener(void)
*/
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
errmgr_cmdline_sender.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN);
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_MIGRATE,
0,
@ -379,14 +379,14 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
swap_dest.jobid = errmgr_cmdline_sender.jobid;
swap_dest.vpid = errmgr_cmdline_sender.vpid;
swap_dest.epoch = errmgr_cmdline_sender.epoch;
ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch);
errmgr_cmdline_sender = *sender;
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
errmgr_cmdline_sender.jobid = swap_dest.jobid;
errmgr_cmdline_sender.vpid = swap_dest.vpid;
errmgr_cmdline_sender.epoch = swap_dest.epoch;
ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch);
goto cleanup;
}

Просмотреть файл

@ -53,6 +53,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -83,9 +84,11 @@ static orte_errmgr_base_module_t global_module = {
orte_errmgr_hnp_global_suggest_map_targets,
/* FT Event hook */
orte_errmgr_hnp_global_ft_event,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_migration_warning
#if ORTE_RESIL_ORTE
/* Set the callback */
orte_errmgr_base_set_fault_callback
,orte_errmgr_base_set_fault_callback
#endif
};
@ -97,14 +100,16 @@ static void failed_start(orte_job_t *jdata);
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate,
orte_proc_state_t state, orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
#if ORTE_RESIL_ORTE
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
#endif
/************************
* API Definitions
@ -380,16 +385,21 @@ cleanup:
**********************/
/*
 * Initialize the global HNP error manager.
 *
 * When the resilient runtime is compiled in, post a persistent wildcard
 * receive for ORTE_RML_TAG_FAILURE_NOTICE handled by failure_notification()
 * and return the result of that call.  Otherwise nothing is registered and
 * ORTE_SUCCESS is returned.
 *
 * ret is declared exactly once and pre-set to ORTE_SUCCESS so the function
 * has a defined return value when the resilient code is compiled out.
 */
int orte_errmgr_hnp_base_global_init(void)
{
    int ret = ORTE_SUCCESS;

#if ORTE_RESIL_ORTE
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE,
                                  ORTE_RML_PERSISTENT, failure_notification, NULL);
#endif

    return ret;
}
/*
 * Tear down the global HNP error manager.
 *
 * When the resilient runtime is compiled in, cancel the wildcard receive
 * registered for ORTE_RML_TAG_FAILURE_NOTICE; otherwise there is nothing to
 * undo.  Always reports ORTE_SUCCESS.
 */
int orte_errmgr_hnp_base_global_finalize(void)
{
#if ORTE_RESIL_ORTE
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
                         ORTE_RML_TAG_FAILURE_NOTICE);
#endif

    return ORTE_SUCCESS;
}
@ -406,6 +416,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
orte_odls_child_t *child;
int rc;
orte_app_context_t *app;
orte_proc_t *pdat;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported state %s"
@ -538,7 +549,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
exit_code);
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -550,7 +561,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
break;
case ORTE_JOB_STATE_COMM_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -562,7 +573,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
break;
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -632,10 +643,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
}
}
if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) {
exit_code = 0;
}
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
@ -679,7 +686,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
if (jdata->enable_recovery) {
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its restart limit */
@ -778,18 +785,37 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
}
} else {
#if !ORTE_RESIL_ORTE
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid), "Unknown");
} else {
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid),
(NULL == pdat->node) ? "Unknown" :
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
}
#endif
if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) {
/* The process is already dead so don't keep trying to do
* this stuff. */
return ORTE_SUCCESS;
}
#if !ORTE_RESIL_ORTE
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
#endif
/* We'll check if the job was complete when we get the
* message back from the HNP notifying us of the dead
* process */
@ -805,7 +831,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
} else {
orte_errmgr_hnp_record_dead_process(proc);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
return ORTE_ERR_UNRECOVERABLE;
@ -824,6 +850,7 @@ int orte_errmgr_hnp_base_global_ft_event(int state)
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
@ -984,6 +1011,7 @@ static void failure_notification(int status, orte_process_name_t* sender,
OBJ_RELEASE(dead_names);
}
#endif
/*****************
* Local Functions
@ -1354,7 +1382,6 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
#if 0
case ORTE_PROC_STATE_ABORTED_BY_SIG:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s aborted by signal",
@ -1370,7 +1397,6 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
#endif
case ORTE_PROC_STATE_TERM_WO_SYNC:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s terminated without sync",
@ -1393,7 +1419,6 @@ static void check_job_complete(orte_job_t *jdata)
}
break;
case ORTE_PROC_STATE_COMM_FAILED:
#if 1
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
@ -1403,7 +1428,6 @@ static void check_job_complete(orte_job_t *jdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
#endif
break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
if (!jdata->abort) {
@ -1530,9 +1554,6 @@ static void check_job_complete(orte_job_t *jdata)
*/
CHECK_DAEMONS:
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
#if 0
if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */
#endif
if (0 == orte_routed.num_routes()) {
/* orteds are done! */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -1696,7 +1717,7 @@ CHECK_ALIVE:
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
@ -1707,7 +1728,9 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_JOBID_WILDCARD == job
&& ORTE_VPID_WILDCARD == vpid
&& ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
@ -1718,7 +1741,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
ORTE_EPOCH_SET(proc.name.epoch,epoch);
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -1913,13 +1936,15 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
}
if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) &&
ORTE_PROC_STATE_TERMINATED < pdat->state) {
ORTE_PROC_STATE_TERMINATED > pdat->state) {
#if ORTE_ENABLE_EPOCH
/* Make sure that the epochs match. */
if (proc->epoch != pdat->name.epoch) {
opal_output(1, "The epoch does not match the current epoch. Throwing the request out.");
return ORTE_SUCCESS;
}
#endif
dead_names = OBJ_NEW(opal_pointer_array_t);
@ -1935,6 +1960,7 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
}
}
#if ORTE_RESIL_ORTE
if (!mca_errmgr_hnp_component.term_in_progress) {
/*
* Send a message to the other daemons so they know that a daemon has
@ -1949,7 +1975,7 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
OBJ_RELEASE(buffer);
} else {
/* Iterate of the list of dead procs and send them along with
/* Iterate over the list of dead procs and send them along with
* the rest. The HNP needs this info so it can tell the other
* ORTEDs and they can inform the appropriate applications.
*/
@ -1973,6 +1999,9 @@ int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
} else {
orte_errmgr_hnp_global_mark_processes_as_dead(dead_names);
}
#else
orte_errmgr_hnp_global_mark_processes_as_dead(dead_names);
#endif
}
return ORTE_SUCCESS;
@ -2011,6 +2040,7 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pdat->name)));
#if ORTE_RESIL_ORTE
/* Make sure the epochs match, if not it probably means that we
* already reported this failure. */
if (name_item->epoch != pdat->name.epoch) {
@ -2018,6 +2048,7 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
}
orte_util_set_epoch(name_item, name_item->epoch + 1);
#endif
/* Remove it from the job array */
opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL);
@ -2034,6 +2065,7 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
OBJ_RELEASE(pdat);
#if ORTE_RESIL_ORTE
/* Create a new proc object that will keep track of the epoch
* information */
pdat = OBJ_NEW(orte_proc_t);
@ -2041,14 +2073,15 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
pdat->name.vpid = name_item->vpid;
pdat->name.epoch = name_item->epoch + 1;
/* Set the state as terminated so we'll know the process isn't
* actually there. */
pdat->state = ORTE_PROC_STATE_TERMINATED;
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
jdat->num_procs++;
jdat->num_terminated++;
#endif
/* Set the state as terminated so we'll know the process isn't
* actually there. */
pdat->state = ORTE_PROC_STATE_TERMINATED;
} else {
#if ORTE_RESIL_ORTE
opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item));
/* Create a new proc object that will keep track of the epoch
* information */
@ -2064,11 +2097,13 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
jdat->num_procs++;
jdat->num_terminated++;
#endif
}
check_job_complete(jdat);
}
#if ORTE_RESIL_ORTE
if (!orte_orteds_term_ordered) {
/* Need to update the orted routing module. */
orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);
@ -2077,10 +2112,12 @@ int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_pro
(*fault_cbfunc)(dead_procs);
}
}
#endif
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
int send_to_local_applications(opal_pointer_array_t *dead_names) {
opal_buffer_t *buf;
int ret = ORTE_SUCCESS;
@ -2121,3 +2158,5 @@ int send_to_local_applications(opal_pointer_array_t *dead_names) {
return ret;
}
#endif

Просмотреть файл

@ -522,7 +522,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
wp_item->name.jobid = proc->jobid;
wp_item->name.vpid = proc->vpid;
wp_item->name.epoch = proc->epoch;
ORTE_EPOCH_SET(wp_item->name.epoch,proc->epoch);
wp_item->state = state;
opal_list_append(procs_pending_recovery, &(wp_item->super));
@ -626,7 +626,7 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_MIN);
wp->state = 0;
}
@ -635,7 +635,7 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(wp->name.epoch,ORTE_EPOCH_INVALID);
wp->state = 0;
}

Просмотреть файл

@ -750,7 +750,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
@ -807,7 +807,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}
@ -855,7 +855,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(iof_name.epoch,proc->name.epoch);
}
}
}

Просмотреть файл

@ -34,6 +34,7 @@
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -41,7 +42,9 @@
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -59,13 +62,15 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int record_dead_process(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
#if ORTE_RESIL_ORTE
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata);
#endif
/*
* Module functions: Global
@ -104,8 +109,10 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
predicted_fault,
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_set_fault_callback /* Set callback function */
orte_errmgr_base_register_migration_warning
#if ORTE_RESIL_ORTE
,orte_errmgr_base_set_fault_callback /* Set callback function */
#endif
};
/************************
@ -113,16 +120,22 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
************************/
/*
 * Module init.
 *
 * When ORTE_RESIL_ORTE is enabled, registers failure_notification as the
 * handler for ORTE_RML_TAG_FAILURE_NOTICE messages from any sender
 * (ORTE_NAME_WILDCARD), with the ORTE_RML_PERSISTENT flag set.
 *
 * Returns the status of that registration, or ORTE_SUCCESS when the
 * resilient code is compiled out.
 */
static int init(void)
{
    /* Single, initialized declaration: the value is returned unchanged
     * when the receive below is not compiled in. */
    int ret = ORTE_SUCCESS;

#if ORTE_RESIL_ORTE
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE,
                                  ORTE_RML_PERSISTENT, failure_notification, NULL);
#endif

    return ret;
}
/*
 * Module finalize.
 *
 * When ORTE_RESIL_ORTE is enabled, cancels the wildcard receive on
 * ORTE_RML_TAG_FAILURE_NOTICE. Always reports ORTE_SUCCESS.
 */
static int finalize(void)
{
    int rc = ORTE_SUCCESS;

#if ORTE_RESIL_ORTE
    /* Counterpart of the receive registered in init(). */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_FAILURE_NOTICE);
#endif

    return rc;
}
@ -228,10 +241,10 @@ static int update_state(orte_jobid_t job,
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
break;
@ -276,7 +289,7 @@ static int update_state(orte_jobid_t job,
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* terminate - our routed children will see
* us leave and automatically die
*/
@ -290,10 +303,18 @@ static int update_state(orte_jobid_t job,
if (0 == orte_routed.num_routes() &&
0 == opal_list_get_size(&orte_local_children)) {
orte_quit();
} else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted not exiting, num_routes() == %d, num children == %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_routed.num_routes(),
opal_list_get_size(&orte_local_children)));
}
}
#if ORTE_RESIL_ORTE
record_dead_process(proc);
#endif
/* if not, then indicate we can continue */
return ORTE_SUCCESS;
@ -344,7 +365,7 @@ static int update_state(orte_jobid_t job,
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
}
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
@ -526,10 +547,12 @@ REPORT_ABORT:
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
#if ORTE_ENABLE_EPOCH
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->epoch, 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
#endif
}
}
/* pack an invalid marker */
@ -660,7 +683,7 @@ static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
continue;
}
if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
if (0 < ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) {
continue;
}
@ -669,9 +692,11 @@ static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name_item)));
#if ORTE_ENABLE_EPOCH
/* Increment the epoch */
orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED);
orte_util_set_epoch(name_item, name_item->epoch + 1);
#endif
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@ -706,6 +731,7 @@ static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
return ORTE_SUCCESS;
}
#if ORTE_RESIL_ORTE
static void failure_notification(int status, orte_process_name_t* sender,
opal_buffer_t *buffer, orte_rml_tag_t tag,
void* cbdata)
@ -714,7 +740,7 @@ static void failure_notification(int status, orte_process_name_t* sender,
orte_std_cntr_t n;
int ret = ORTE_SUCCESS, num_failed;
int32_t i;
orte_process_name_t *name_item, proc;
orte_process_name_t *name_item;
dead_names = OBJ_NEW(opal_pointer_array_t);
@ -746,7 +772,7 @@ static void failure_notification(int status, orte_process_name_t* sender,
/* There shouldn't be an issue of receiving this message multiple
* times but it doesn't hurt to double check.
*/
if (proc.epoch < orte_util_lookup_epoch(name_item)) {
if (0 < ORTE_EPOCH_CMP(name_item->epoch,orte_ess.proc_get_epoch(name_item))) {
opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
continue;
}
@ -767,6 +793,7 @@ static void failure_notification(int status, orte_process_name_t* sender,
free(name_item);
}
}
#endif
/*****************
* Local Functions
@ -948,11 +975,13 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
ORTE_ERROR_LOG(rc);
return rc;
}
#if ORTE_ENABLE_EPOCH
/* Pack the child's epoch. */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
/* pack the contact info */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
@ -1015,7 +1044,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
@ -1026,7 +1055,9 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_JOBID_WILDCARD == job
&& ORTE_VPID_WILDCARD == vpid
&& 0 == ORTE_EPOCH_CMP(ORTE_EPOCH_WILDCARD,epoch)) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
@ -1037,7 +1068,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
ORTE_EPOCH_SET(proc.name.epoch,epoch);
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -1082,20 +1113,21 @@ static int record_dead_process(orte_process_name_t *proc) {
return rc;
}
#if ORTE_RESIL_ORTE
int send_to_local_applications(opal_pointer_array_t *dead_names) {
opal_buffer_t *buf;
int ret;
orte_process_name_t *name_item;
int size, i;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"%s Sending failure to local applications.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
buf = OBJ_NEW(opal_buffer_t);
size = opal_pointer_array_get_size(dead_names);
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"%s Sending %d failure(s) to local applications.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size));
if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buf);
@ -1122,4 +1154,5 @@ int send_to_local_applications(opal_pointer_array_t *dead_names) {
return ORTE_SUCCESS;
}
#endif

Просмотреть файл

@ -363,8 +363,8 @@ static int alps_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
/* Reset our epoch, then load the current value reported by the ESS module.
 * NOTE(review): uses the two-argument ORTE_EPOCH_SET assignment form, as the
 * other ess set_name routines do - confirm against the macro definitions. */
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -57,7 +57,11 @@ ORTE_DECLSPEC extern int orte_ess_base_output;
ORTE_DECLSPEC extern opal_list_t orte_ess_base_components_available;
/* Prototype of the generic epoch lookup. Its return type follows the build
 * configuration: orte_epoch_t when ORTE_ENABLE_EPOCH is set, int otherwise. */
#if ORTE_ENABLE_EPOCH
ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc);
#else
ORTE_DECLSPEC int orte_ess_base_proc_get_epoch(orte_process_name_t *proc);
#endif /* ORTE_ENABLE_EPOCH */
#if !ORTE_DISABLE_FULL_SUPPORT

Просмотреть файл

@ -36,21 +36,19 @@ extern opal_list_t orte_ess_base_components_available;
* Generic function to retrieve the epoch of a specific process
* from the job data.
*/
#if !ORTE_ENABLE_EPOCH
/* Epoch support is compiled out: report 0 for every process. */
int orte_ess_base_proc_get_epoch(orte_process_name_t *proc) {
return 0;
}
#else
/* Epoch support is compiled in: return the epoch looked up for proc, or
 * ORTE_EPOCH_INVALID when the lookup below is not compiled. */
orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc) {
orte_epoch_t epoch = ORTE_EPOCH_INVALID;
/* The lookup is only built when full ORTE support is not disabled. */
#if !ORTE_DISABLE_FULL_SUPPORT
epoch = orte_util_lookup_epoch(proc);
#endif
/* Trace the result at verbosity level 2 on the ESS base output stream. */
OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
"%s ess:generic: proc %s has epoch %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc),
epoch));
return epoch;
}
#endif /* !ORTE_ENABLE_EPOCH */
int
orte_ess_base_select(void)

3
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -392,8 +392,7 @@ static int env_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:env set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -111,7 +111,11 @@ typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_pr
* will get the most up to date version stored within the orte_proc_t struct.
* Obviously the epoch of the proc that is passed in will be ignored.
*/
/* The function-pointer type returns orte_epoch_t when ORTE_ENABLE_EPOCH is
 * set and int otherwise; the parameter list is the same in both builds. */
#if ORTE_ENABLE_EPOCH
typedef orte_epoch_t (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc);
#else
typedef int (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc);
#endif /* ORTE_ENABLE_EPOCH */
/**
* Update the pidmap

Просмотреть файл

@ -155,7 +155,7 @@ static int rte_init(void)
goto error;
}
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s completed name definition",
@ -273,7 +273,7 @@ static int rte_init(void)
if (vpid == ORTE_PROC_MY_NAME->vpid) {
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = i;
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
}
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s node %d name %s rank %s",
@ -304,7 +304,7 @@ static int rte_init(void)
if (vpid == ORTE_PROC_MY_NAME->vpid) {
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = i;
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
}
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s node %d name %s rank %d",

Просмотреть файл

@ -494,7 +494,7 @@ static int rte_init(void)
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
proc->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
proc->pid = orte_process_info.pid;
proc->rml_uri = orte_rml.get_contact_info();

Просмотреть файл

@ -357,8 +357,7 @@ static int lsf_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
/* fix up the base name and make it the "real" name */
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));

Просмотреть файл

@ -188,7 +188,7 @@ static int rte_init(void)
/* set the name */
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
} else {
/*

Просмотреть файл

@ -280,8 +280,7 @@ static int slave_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -368,8 +368,7 @@ static int slurm_set_name(void)
/* fix up the vpid and make it the "real" vpid */
slurm_nodeid = atoi(getenv("SLURM_NODEID"));
ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -195,7 +195,7 @@ static int rte_init(void)
}
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
#endif
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
/* get our local rank */
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
error = "could not get SLURM_LOCALID";
@ -260,7 +260,7 @@ static int rte_init(void)
nodeid = strtol(envar, NULL, 10);
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = nodeid;
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
/* get the number of ppn */
if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {

Просмотреть файл

@ -364,7 +364,7 @@ static int tm_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:tm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -1097,11 +1097,11 @@ static int orte_filem_rsh_start_command(orte_filem_base_process_set_t *proc_set
if( NULL != proc_set ) {
wp_item->proc_set.source.jobid = proc_set->source.jobid;
wp_item->proc_set.source.vpid = proc_set->source.vpid;
wp_item->proc_set.source.epoch = proc_set->source.epoch;
ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,proc_set->source.epoch);
wp_item->proc_set.sink.jobid = proc_set->sink.jobid;
wp_item->proc_set.sink.vpid = proc_set->sink.vpid;
wp_item->proc_set.sink.epoch = proc_set->sink.epoch;
ORTE_EPOCH_SET(wp_item->proc_set.sink.epoch,proc_set->sink.epoch);
}
/* Copy the File Set */
if( NULL != file_set ) {
@ -1396,7 +1396,7 @@ static void orte_filem_rsh_permission_callback(int status,
wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t);
wp_item->proc_set.source.jobid = sender->jobid;
wp_item->proc_set.source.vpid = sender->vpid;
wp_item->proc_set.source.epoch = sender->epoch;
ORTE_EPOCH_SET(wp_item->proc_set.source.epoch,sender->epoch);
opal_list_append(&work_pool_waiting, &(wp_item->super));
}

Просмотреть файл

@ -168,8 +168,7 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
/* I send first */
peer.vpid = vpids[1];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
/* setup a temp buffer so I can inform the other side as to the
* number of entries in my buffer
@ -226,8 +225,7 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
opal_dss.copy_payload(&buf, sendbuf);
peer.vpid = vpids[0];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:coll:two-proc sending to %s",
@ -320,8 +318,7 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
/* first send my current contents */
nv = (rank - distance + np) % np;
peer.vpid = vpids[nv];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
@ -340,8 +337,7 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
num_recvd = 0;
nv = (rank + distance) % np;
peer.vpid = vpids[nv];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
@ -439,8 +435,7 @@ static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int
/* first send my current contents */
nv = rank ^ distance;
peer.vpid = vpids[nv];
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
@ -646,8 +641,7 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
proc.jobid = jobid;
proc.vpid = 0;
while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
proc.epoch = ORTE_EPOCH_INVALID;
proc.epoch = orte_ess.proc_get_epoch(&proc);
ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc));
/* get the daemon that hosts this proc */
daemonvpid = orte_ess.proc_get_daemon(&proc);
@ -713,8 +707,7 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
/* send it */
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
my_parent.vpid = orte_routed.get_routing_tree(NULL);
my_parent.epoch = ORTE_EPOCH_INVALID;
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent));
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s",

Просмотреть файл

@ -95,7 +95,7 @@ static int init(void)
my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid;
my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID;
my_local_rank_zero_proc.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,ORTE_EPOCH_MIN);
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) {
ORTE_ERROR_LOG(rc);
@ -270,7 +270,7 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
proc.jobid = ORTE_PROC_MY_NAME->jobid;
for (v=0; v < orte_process_info.num_procs; v++) {
proc.vpid = v;
proc.epoch = orte_util_lookup_epoch(&proc);
ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc));
/* is this proc local_rank=0 on its node? */
if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
@ -285,7 +285,7 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = proc.jobid;
nm->name.vpid = proc.vpid;
nm->name.epoch = proc.epoch;
ORTE_EPOCH_SET(nm->name.epoch,proc.epoch);
opal_list_append(&my_local_peers, &nm->item);
/* if I am not local_rank=0, is this one? */
@ -293,7 +293,7 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
0 == orte_ess.get_local_rank(&proc)) {
my_local_rank_zero_proc.jobid = proc.jobid;
my_local_rank_zero_proc.vpid = proc.vpid;
my_local_rank_zero_proc.epoch = proc.epoch;
ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch);
}
}

Просмотреть файл

@ -135,7 +135,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
ep = OBJ_NEW(orte_iof_sink_t); \
ep->name.jobid = (nm)->jobid; \
ep->name.vpid = (nm)->vpid; \
ep->name.epoch = (nm)->epoch; \
ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \
ep->tag = (tg); \
if (0 <= (fid)) { \
ep->wev->fd = (fid); \
@ -169,7 +169,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
rev = OBJ_NEW(orte_iof_read_event_t); \
rev->name.jobid = (nm)->jobid; \
rev->name.vpid = (nm)->vpid; \
rev->name.epoch = (nm)->epoch; \
ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \
rev->tag = (tg); \
rev->fd = (fid); \
*(rv) = rev; \
@ -194,7 +194,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
ep = OBJ_NEW(orte_iof_sink_t); \
ep->name.jobid = (nm)->jobid; \
ep->name.vpid = (nm)->vpid; \
ep->name.epoch = (nm)->epoch; \
ORTE_EPOCH_SET(ep->name.epoch,(nm)->epoch); \
ep->tag = (tg); \
if (0 <= (fid)) { \
ep->wev->fd = (fid); \
@ -215,7 +215,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
rev = OBJ_NEW(orte_iof_read_event_t); \
rev->name.jobid = (nm)->jobid; \
rev->name.vpid = (nm)->vpid; \
rev->name.epoch= (nm)->epoch; \
ORTE_EPOCH_SET(rev->name.epoch,(nm)->epoch); \
rev->tag = (tg); \
*(rv) = rev; \
opal_event_set(opal_event_base, \

Просмотреть файл

@ -91,7 +91,7 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr)
{
ptr->daemon.jobid = ORTE_JOBID_INVALID;
ptr->daemon.vpid = ORTE_VPID_INVALID;
ptr->daemon.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ptr->daemon.epoch,ORTE_EPOCH_MIN);
ptr->wev = OBJ_NEW(orte_iof_write_event_t);
}
static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr)

Просмотреть файл

@ -186,7 +186,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
proct->name.epoch = dst_name->epoch;
ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch);
opal_list_append(&mca_iof_hnp_component.procs, &proct->super);
/* see if we are to output to a file */
if (NULL != orte_output_filename) {
@ -281,8 +281,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
&mca_iof_hnp_component.sinks);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = proc->node->daemon->name.vpid;
sink->daemon.epoch = ORTE_EPOCH_INVALID;
sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon);
ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon));
}
}
@ -389,7 +388,7 @@ static int hnp_pull(const orte_process_name_t* dst_name,
&mca_iof_hnp_component.sinks);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(sink->daemon.epoch,ORTE_PROC_MY_NAME->epoch);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -109,21 +109,21 @@ static void process_msg(int fd, short event, void *cbdata)
NULL, &mca_iof_hnp_component.sinks);
sink->daemon.jobid = mev->sender.jobid;
sink->daemon.vpid = mev->sender.vpid;
sink->daemon.epoch = mev->sender.epoch;
ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch);
}
if (ORTE_IOF_STDERR & stream) {
ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR,
NULL, &mca_iof_hnp_component.sinks);
sink->daemon.jobid = mev->sender.jobid;
sink->daemon.vpid = mev->sender.vpid;
sink->daemon.epoch = mev->sender.epoch;
ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch);
}
if (ORTE_IOF_STDDIAG & stream) {
ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG,
NULL, &mca_iof_hnp_component.sinks);
sink->daemon.jobid = mev->sender.jobid;
sink->daemon.vpid = mev->sender.vpid;
sink->daemon.epoch = mev->sender.epoch;
ORTE_EPOCH_SET(sink->daemon.epoch,mev->sender.epoch);
}
goto CLEAN_RETURN;
}

Просмотреть файл

@ -163,7 +163,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
proct->name.epoch = dst_name->epoch;
ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch);
opal_list_append(&mca_iof_orted_component.procs, &proct->super);
/* see if we are to output to a file */
if (NULL != orte_output_filename) {

Просмотреть файл

@ -734,8 +734,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
proc.jobid = jobdat->jobid;
for (j=0; j < jobdat->num_procs; j++) {
proc.vpid = j;
proc.epoch = ORTE_EPOCH_INVALID;
proc.epoch = orte_ess.proc_get_epoch(&proc);
ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc));
/* get the vpid of the daemon that is to host this proc */
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
@ -1044,6 +1043,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
free(param);
free(value);
#if ORTE_ENABLE_EPOCH
/* setup the epoch */
if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, child->name->epoch))) {
ORTE_ERROR_LOG(rc);
@ -1057,6 +1057,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
opal_setenv(param, value, true, env);
free(param);
free(value);
#endif
/* setup the vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) {
@ -2721,7 +2722,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
OBJ_CONSTRUCT(&proctmp, orte_proc_t);
proctmp.name.jobid = ORTE_JOBID_WILDCARD;
proctmp.name.vpid = ORTE_VPID_WILDCARD;
proctmp.name.epoch = ORTE_EPOCH_WILDCARD;
ORTE_EPOCH_SET(proctmp.name.epoch,ORTE_EPOCH_WILDCARD);
opal_pointer_array_add(&procarray, &proctmp);
procptr = &procarray;
do_cleanup = true;

Просмотреть файл

@ -187,7 +187,7 @@ int orte_odls_base_open(void)
if (-1 == rank) {
/* wildcard */
nm->name.vpid = ORTE_VPID_WILDCARD;
nm->name.epoch = ORTE_EPOCH_WILDCARD;
ORTE_EPOCH_SET(nm->name.epoch,ORTE_EPOCH_WILDCARD);
} else if (rank < 0) {
/* error out on bozo case */
orte_show_help("help-odls-base.txt",
@ -200,8 +200,7 @@ int orte_odls_base_open(void)
* will be in the job - we'll check later
*/
nm->name.vpid = rank;
nm->name.epoch = ORTE_EPOCH_INVALID;
nm->name.epoch = orte_ess.proc_get_epoch(&nm->name);
ORTE_EPOCH_SET(nm->name.epoch,orte_ess.proc_get_epoch(&nm->name));
}
opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item);
}

Просмотреть файл

@ -77,17 +77,17 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
/* if I am the HNP, then use me as the source */
p_set->source.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->source.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->source.epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch);
}
else {
/* otherwise, set the HNP as the source */
p_set->source.jobid = ORTE_PROC_MY_HNP->jobid;
p_set->source.vpid = ORTE_PROC_MY_HNP->vpid;
p_set->source.epoch = ORTE_PROC_MY_HNP->epoch;
ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch);
}
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch);
opal_list_append(&(filem_request->process_sets), &(p_set->super) );

Просмотреть файл

@ -137,6 +137,7 @@ int mca_oob_tcp_msg_complete(mca_oob_tcp_msg_t* msg, orte_process_name_t * peer)
bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_peer_t * peer)
{
int rc;
while(1) {
rc = writev(peer->peer_sd, msg->msg_rwptr, msg->msg_rwnum);
if(rc < 0) {
@ -338,6 +339,7 @@ static void mca_oob_tcp_msg_ident(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pe
orte_process_name_t src = msg->msg_hdr.msg_src;
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, &src) != OPAL_EQUAL) {
opal_hash_table_remove_value_uint64(&mca_oob_tcp_component.tcp_peers,
orte_util_hash_name(&peer->peer_name));

Просмотреть файл

@ -903,6 +903,11 @@ int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer)
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
{
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user;
if (orte_abnormal_term_ordered) {
return;
}
OPAL_THREAD_LOCK(&peer->peer_lock);
switch(peer->peer_state) {
case MCA_OOB_TCP_CONNECT_ACK:

Просмотреть файл

@ -62,12 +62,12 @@ int orte_plm_base_set_hnp_name(void)
/* set the name */
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0;
ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
/* copy it to the HNP field */
ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_PROC_MY_NAME->epoch);
/* done */
return ORTE_SUCCESS;

Просмотреть файл

@ -377,8 +377,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
/* push stdin - the IOF will know what to do with the specified target */
name.jobid = job;
name.vpid = jdata->stdin_target;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name));
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -163,8 +163,7 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
continue;
}
peer.vpid = v;
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
/* don't worry about errors on the send here - just
* issue it and keep going
@ -242,7 +241,7 @@ int orte_plm_base_orted_terminate_job(orte_jobid_t jobid)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = jobid;
proc.name.vpid = ORTE_VPID_WILDCARD;
proc.name.epoch = ORTE_EPOCH_WILDCARD;
ORTE_EPOCH_SET(proc.name.epoch,ORTE_EPOCH_WILDCARD);
opal_pointer_array_add(&procs, &proc);
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) {
ORTE_ERROR_LOG(rc);
@ -340,8 +339,7 @@ int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs)
continue;
}
peer.vpid = v;
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
/* check to see if this daemon is known to be "dead" */
if (proc->state > ORTE_PROC_STATE_UNTERMINATED) {
/* don't try to send this */

Просмотреть файл

@ -146,7 +146,9 @@ static void process_msg(int fd, short event, void *data)
orte_job_t *jdata, *parent;
opal_buffer_t answer;
orte_vpid_t vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch;
#endif
orte_proc_t *proc;
orte_proc_state_t state;
orte_exit_code_t exit_code;
@ -394,8 +396,7 @@ static void process_msg(int fd, short event, void *data)
break;
}
name.vpid = vpid;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name));
/* unpack the pid */
count = 1;
@ -488,9 +489,11 @@ static void process_msg(int fd, short event, void *data)
}
name.vpid = vpid;
#if ORTE_ENABLE_EPOCH
count=1;
opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH);
name.epoch = epoch;
#endif
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive Described rank %s",

Просмотреть файл

@ -1527,7 +1527,9 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
{
char *param, *path, *tmp, *cmd, *basename, *dest_dir;
int i;
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch;
#endif
orte_process_name_t proc;
/* if a prefix is set, pass it to the bootproxy in a special way */
@ -1638,6 +1640,7 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv);
free(cmd);
#if ORTE_ENABLE_EPOCH
/* set the epoch */
proc.jobid = jobid;
proc.vpid = vpid;
@ -1648,6 +1651,7 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
opal_setenv(param, cmd, true, argv);
free(param);
free(cmd);
#endif
/* set the number of procs */
asprintf(&cmd, "%d", (int)num_procs);

Просмотреть файл

@ -33,12 +33,14 @@
#include "orte/mca/ess/ess.h"
#include "opal/mca/sysinfo/sysinfo_types.h"
#include "orte/types.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
@ -454,7 +456,7 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
*/
/* We do set the epoch here since they all start with the same value. */
proc->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
proc->app_idx = app_idx;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@ -559,11 +561,12 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
}
}
proc->name.vpid = vpid;
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (ORTE_NODE_RANK_INVALID == proc->name.epoch) {
proc->name.epoch = ORTE_EPOCH_MIN;
if (0 == ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc->name.epoch)) {
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
}
}
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
@ -601,8 +604,8 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
}
}
proc->name.vpid = vpid;
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
}
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
@ -835,7 +838,7 @@ int orte_rmaps_base_define_daemons(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.vpid = daemons->num_procs; /* take the next available vpid */
proc->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
proc->node = node;
proc->nodename = node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@ -1014,8 +1017,8 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.vpid = jdata->num_procs; /* take the next available vpid */
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
proc->node = node;
proc->nodename = node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,

Просмотреть файл

@ -502,8 +502,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
proc->name.vpid = rank;
/* Either init or update the epoch. */
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
proc->slot_list = strdup(rfmap->slot_list);
/* insert the proc into the proper place */

Просмотреть файл

@ -235,8 +235,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
}
/* assign the vpid */
proc->name.vpid = vpid++;
proc->name.epoch = ORTE_EPOCH_INVALID;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
/* add to the jdata proc array */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {

Просмотреть файл

@ -341,7 +341,7 @@ static void recv_construct(rmcast_base_recv_t *ptr)
{
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN);
ptr->channel = ORTE_RMCAST_INVALID_CHANNEL;
OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t);
ptr->seq_num = ORTE_RMCAST_SEQ_INVALID;
@ -430,7 +430,7 @@ static void recvlog_construct(rmcast_recv_log_t *ptr)
{
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN);
OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t);
}
static void recvlog_destruct(rmcast_recv_log_t *ptr)
@ -439,7 +439,7 @@ static void recvlog_destruct(rmcast_recv_log_t *ptr)
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch = ORTE_EPOCH_INVALID;
ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_INVALID);
while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) {
OBJ_RELEASE(item);
}

Просмотреть файл

@ -681,7 +681,7 @@ static int tcp_recv(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
}
*seq_num = recvptr->seq_num;
*msg = recvptr->iovec_array;
@ -776,7 +776,7 @@ static int tcp_recv_buffer(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
}
*seq_num = recvptr->seq_num;
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {

Просмотреть файл

@ -460,7 +460,7 @@ static int udp_recv(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
}
*seq_num = recvptr->seq_num;
*msg = recvptr->iovec_array;
@ -553,7 +553,7 @@ static int udp_recv_buffer(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
ORTE_EPOCH_SET(name->epoch,recvptr->name.epoch);
}
*seq_num = recvptr->seq_num;
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {

Просмотреть файл

@ -20,6 +20,7 @@
#include "opal/util/output.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#if !ORTE_DISABLE_FULL_SUPPORT
@ -67,14 +68,14 @@ static void msg_pkt_constructor(orte_msg_packet_t *pkt)
{
pkt->sender.jobid = ORTE_JOBID_INVALID;
pkt->sender.vpid = ORTE_VPID_INVALID;
pkt->sender.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_MIN);
pkt->buffer = NULL;
}
/*
 * Tear-down routine for an orte_msg_packet_t.
 *
 * Resets the sender name to its "invalid" values and drops the packet's
 * reference on its buffer when one is attached.
 *
 * @param pkt  packet being torn down; must not be NULL (it is dereferenced
 *             unconditionally).
 */
static void msg_pkt_destructor(orte_msg_packet_t *pkt)
{
    pkt->sender.jobid = ORTE_JOBID_INVALID;
    pkt->sender.vpid = ORTE_VPID_INVALID;
    /* The epoch is written only through ORTE_EPOCH_SET. Raw accesses to the
     * epoch field are guarded by ORTE_ENABLE_EPOCH elsewhere in the tree, so
     * a bare "pkt->sender.epoch = ..." here would duplicate the macro and is
     * not safe in builds configured without epoch support.
     * NOTE(review): assumes ORTE_EPOCH_SET handles the disabled case itself -
     * confirm against the macro definition. */
    ORTE_EPOCH_SET(pkt->sender.epoch,ORTE_EPOCH_INVALID);
    /* buffer is optional; release it only when one is attached */
    if (NULL != pkt->buffer) {
        OBJ_RELEASE(pkt->buffer);
    }
}

Просмотреть файл

@ -62,7 +62,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
pkt = OBJ_NEW(orte_msg_packet_t); \
pkt->sender.jobid = (sndr)->jobid; \
pkt->sender.vpid = (sndr)->vpid; \
pkt->sender.epoch = (sndr)->epoch; \
ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \
if ((crt)) { \
pkt->buffer = OBJ_NEW(opal_buffer_t); \
opal_dss.copy_payload(pkt->buffer, *(buf)); \
@ -85,7 +85,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
pkt = OBJ_NEW(orte_msg_packet_t); \
pkt->sender.jobid = (sndr)->jobid; \
pkt->sender.vpid = (sndr)->vpid; \
pkt->sender.epoch = (sndr)->epoch; \
ORTE_EPOCH_SET(pkt->sender.epoch,(sndr)->epoch); \
if ((crt)) { \
pkt->buffer = OBJ_NEW(opal_buffer_t); \
opal_dss.copy_payload(pkt->buffer, *(buf)); \
@ -191,8 +191,10 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
#define ORTE_RML_TAG_SUBSCRIBE 46
#if ORTE_ENABLE_EPOCH
/* For Epoch Updates */
#define ORTE_RML_TAG_EPOCH_CHANGE 47
#endif
/* Notify of failed processes */
#define ORTE_RML_TAG_FAILURE_NOTICE 48

Просмотреть файл

@ -65,7 +65,7 @@ static void jfamconst(orte_routed_jobfam_t *ptr)
{
ptr->route.jobid = ORTE_JOBID_INVALID;
ptr->route.vpid = ORTE_VPID_INVALID;
ptr->route.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ptr->route.epoch,ORTE_EPOCH_MIN);
ptr->hnp_uri = NULL;
}
static void jfamdest(orte_routed_jobfam_t *ptr)
@ -117,7 +117,7 @@ orte_routed_base_open(void)
jfam = OBJ_NEW(orte_routed_jobfam_t);
jfam->route.jobid = ORTE_PROC_MY_HNP->jobid;
jfam->route.vpid = ORTE_PROC_MY_HNP->vpid;
jfam->route.epoch = ORTE_PROC_MY_HNP->epoch;
ORTE_EPOCH_SET(jfam->route.epoch,ORTE_PROC_MY_HNP->epoch);
jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
if (NULL != orte_process_info.my_hnp_uri) {
jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri);
@ -252,7 +252,7 @@ void orte_routed_base_update_hnps(opal_buffer_t *buf)
jfam->job_family = jobfamily;
jfam->route.jobid = name.jobid;
jfam->route.vpid = name.vpid;
jfam->route.epoch = name.epoch;
ORTE_EPOCH_SET(jfam->route.epoch,name.epoch);
jfam->hnp_uri = strdup(uri);
done:
free(uri);

Просмотреть файл

@ -127,7 +127,9 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
orte_std_cntr_t cnt;
char *rml_uri;
orte_vpid_t vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch;
#endif
int rc;
if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
@ -146,11 +148,13 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
cnt = 1;
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) {
#if ORTE_ENABLE_EPOCH
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
continue;
}
#endif
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -33,6 +33,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -147,7 +148,7 @@ static int delete_route(orte_process_name_t *proc)
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -216,7 +217,7 @@ static int update_route(orte_process_name_t *target,
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -274,8 +275,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route));
return ORTE_SUCCESS;
}
@ -290,8 +290,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route));
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
@ -317,11 +316,21 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* initialize */
daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;
daemon.epoch = ORTE_PROC_MY_DAEMON->epoch;
ORTE_EPOCH_SET(daemon.epoch,ORTE_PROC_MY_DAEMON->epoch);
#if ORTE_ENABLE_EPOCH
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
#else
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
#endif
ret = ORTE_NAME_INVALID;
goto found;
}
if (0 > ORTE_EPOCH_CMP(target->epoch, orte_ess.proc_get_epoch(target))) {
ret = ORTE_NAME_INVALID;
goto found;
}
@ -443,7 +452,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* If the daemon to which we should be routing is dead, then update
* the routing tree and start over. */
if (!orte_util_proc_is_running(&daemon)) {
if (!PROC_IS_RUNNING(&daemon)) {
update_routing_tree(daemon.jobid);
goto startover;
}
@ -461,8 +470,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
ret = &daemon;
found:
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon));
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_binomial_get(%s) --> %s",
@ -879,7 +887,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = proc->epoch;
ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch);
lifeline = &local_lifeline;
return ORTE_SUCCESS;
@ -924,11 +932,11 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
* that process so we can check it's state.
*/
proc_name.vpid = peer;
proc_name.epoch = orte_util_lookup_epoch(&proc_name);
ORTE_EPOCH_SET(proc_name.epoch,orte_util_lookup_epoch(&proc_name));
if (!orte_util_proc_is_running(&proc_name)
&& ORTE_EPOCH_MIN < proc_name.epoch
&& ORTE_EPOCH_INVALID != proc_name.epoch) {
if (!PROC_IS_RUNNING(&proc_name)
&& 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,proc_name.epoch)
&& 0 != ORTE_EPOCH_CMP(ORTE_EPOCH_INVALID,proc_name.epoch)) {
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
"%s routed:binomial child %s is dead",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -967,7 +975,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
}
/* find the children of this rank */
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:binomial find children of rank %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
bitmap = opal_cube_dim(num_procs);
@ -977,24 +985,25 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
peer = rank | mask;
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:binomial find children checking peer %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
if (peer < num_procs) {
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:binomial find children computing tree",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* execute compute on this child */
if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine, jobid))) {
proc_name.vpid = found;
if (!orte_util_proc_is_running(&proc_name) && ORTE_EPOCH_MIN < orte_util_lookup_epoch(&proc_name)) {
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
if (!PROC_IS_RUNNING(&proc_name)
&& 0 < ORTE_EPOCH_CMP(ORTE_EPOCH_MIN,orte_util_lookup_epoch(&proc_name))) {
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:binomial find children proc out of date - returning parent %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), parent));
return parent;
}
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
"%s routed:binomial find children returning found value %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
return found;
@ -1029,8 +1038,7 @@ static int update_routing_tree(orte_jobid_t jobid)
ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
orte_process_info.max_procs,
&num_children, &my_children, NULL, true, jobid);
ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_PARENT->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT);
ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT));
if (0 < opal_output_get_verbosity(orte_routed_base_output)) {
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children);

Просмотреть файл

@ -35,6 +35,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -139,7 +140,7 @@ static int delete_route(orte_process_name_t *proc)
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -200,7 +201,7 @@ static int update_route(orte_process_name_t *target,
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -257,8 +258,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route));
return ORTE_SUCCESS;
}
@ -273,8 +273,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = ORTE_EPOCH_INVALID;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
ORTE_EPOCH_SET(jfam->route.epoch,orte_ess.proc_get_epoch(&jfam->route));
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
@ -299,7 +298,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
ret = ORTE_NAME_INVALID;
goto found;
}
@ -367,8 +366,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/* Initialize daemon's epoch, based on its current vpid/jobid */
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon));
/* if the daemon is me, then send direct to the target! */
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
@ -814,8 +812,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = ORTE_EPOCH_INVALID;
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline));
lifeline = &local_lifeline;

Просмотреть файл

@ -24,6 +24,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -135,7 +136,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
ret = ORTE_NAME_INVALID;
} else {
/* all routes are direct */

Просмотреть файл

@ -31,6 +31,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -132,7 +133,7 @@ static int delete_route(orte_process_name_t *proc)
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -201,7 +202,7 @@ static int update_route(orte_process_name_t *target,
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -259,7 +260,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
ORTE_EPOCH_SET(jfam->route.epoch,route->epoch);
return ORTE_SUCCESS;
}
}
@ -273,7 +274,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
ORTE_EPOCH_SET(jfam->route.epoch,route->epoch);
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
@ -373,8 +374,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/* Initialize daemon's epoch, based on its current vpid/jobid */
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon));
/* if the daemon is me, then send direct to the target! */
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
@ -395,8 +395,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* we are at end of chain - wrap around */
daemon.vpid = 0;
}
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon));
ret = &daemon;
}
}
@ -741,7 +740,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = proc->epoch;
ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch);
lifeline = &local_lifeline;
return ORTE_SUCCESS;

Просмотреть файл

@ -31,6 +31,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -145,7 +146,7 @@ static int delete_route(orte_process_name_t *proc)
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(proc->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -214,7 +215,7 @@ static int update_route(orte_process_name_t *target,
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
return ORTE_ERR_BAD_PARAM;
}
@ -272,7 +273,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
ORTE_EPOCH_SET(jfam->route.epoch,route->epoch);
return ORTE_SUCCESS;
}
}
@ -286,7 +287,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
ORTE_EPOCH_SET(jfam->route.epoch,route->epoch);
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
@ -310,7 +311,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
ret = ORTE_NAME_INVALID;
goto found;
}
@ -413,8 +414,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
/* yep - we need to step through this child */
daemon.vpid = child->vpid;
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon));
ret = &daemon;
goto found;
}
@ -425,8 +425,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
* any of our children, so we have to step up through our parent
*/
daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
daemon.epoch = ORTE_EPOCH_INVALID;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ORTE_EPOCH_SET(daemon.epoch,orte_ess.proc_get_epoch(&daemon));
ret = &daemon;
@ -788,7 +787,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = proc->epoch;
ORTE_EPOCH_SET(local_lifeline.epoch,proc->epoch);
lifeline = &local_lifeline;
return ORTE_SUCCESS;
@ -881,8 +880,7 @@ static int update_routing_tree(orte_jobid_t jobid)
ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel;
ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel);
}
ORTE_PROC_MY_PARENT->epoch = ORTE_EPOCH_INVALID;
ORTE_PROC_MY_PARENT->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT);
ORTE_EPOCH_SET(ORTE_PROC_MY_PARENT->epoch,orte_ess.proc_get_epoch(ORTE_PROC_MY_PARENT));
/* compute my direct children and the bitmap that shows which vpids
* lie underneath their branch

Просмотреть файл

@ -26,6 +26,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -134,7 +135,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
0 == ORTE_EPOCH_CMP(target->epoch,ORTE_EPOCH_INVALID)) {
ret = ORTE_NAME_INVALID;
} else {
/* a slave must always route via its parent daemon */
@ -275,8 +276,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = ORTE_EPOCH_INVALID;
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
ORTE_EPOCH_SET(local_lifeline.epoch,orte_ess.proc_get_epoch(&local_lifeline));
lifeline = &local_lifeline;

Просмотреть файл

@ -70,7 +70,9 @@ typedef struct {
opal_list_item_t super;
orte_jobid_t jobid;
orte_vpid_t vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch;
#endif
char *file;
int tick;
bool check_size;

Просмотреть файл

@ -81,7 +81,7 @@ void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
@ -92,7 +92,7 @@ void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;

Просмотреть файл

@ -427,7 +427,7 @@ int global_coord_start_ckpt(orte_snapc_base_quiesce_t *datum)
new_proc = OBJ_NEW(orte_proc_t);
new_proc->name.jobid = proc->name.jobid;
new_proc->name.vpid = proc->name.vpid;
new_proc->name.epoch = proc->name.epoch;
ORTE_EPOCH_SET(new_proc->name.epoch,proc->name.epoch);
new_proc->node = OBJ_NEW(orte_node_t);
new_proc->node->name = proc->node->name;
opal_list_append(migrating_procs, &new_proc->super);
@ -618,7 +618,7 @@ static int global_init_job_structs(void)
orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid;
orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid;
orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch;
ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch);
mask = ORTE_NS_CMP_JOBID;
@ -636,7 +636,7 @@ static int global_init_job_structs(void)
app_snapshot->process_name.jobid = procs[p]->name.jobid;
app_snapshot->process_name.vpid = procs[p]->name.vpid;
app_snapshot->process_name.epoch = procs[p]->name.epoch;
ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch);
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
}
@ -800,7 +800,7 @@ static int global_refresh_job_structs(void)
app_snapshot->process_name.jobid = procs[p]->name.jobid;
app_snapshot->process_name.vpid = procs[p]->name.vpid;
app_snapshot->process_name.epoch = procs[p]->name.epoch;
ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch);
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
}
@ -816,7 +816,7 @@ static int global_refresh_job_structs(void)
orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid;
orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid;
orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch;
ORTE_EPOCH_SET(orted_snapshot->process_name.epoch,cur_node->daemon->name.epoch);
mask = ORTE_NS_CMP_ALL;
@ -837,7 +837,7 @@ static int global_refresh_job_structs(void)
app_snapshot->process_name.jobid = procs[p]->name.jobid;
app_snapshot->process_name.vpid = procs[p]->name.vpid;
app_snapshot->process_name.epoch = procs[p]->name.epoch;
ORTE_EPOCH_SET(app_snapshot->process_name.epoch,procs[p]->name.epoch);
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
}

Просмотреть файл

@ -2033,7 +2033,7 @@ static int snapc_full_local_get_vpids(void)
vpid_snapshot->process_pid = child->pid;
vpid_snapshot->super.process_name.jobid = child->name->jobid;
vpid_snapshot->super.process_name.vpid = child->name->vpid;
vpid_snapshot->super.process_name.epoch = child->name->epoch;
ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch);
}
}
@ -2095,7 +2095,7 @@ static int snapc_full_local_refresh_vpids(void)
vpid_snapshot->process_pid = child->pid;
vpid_snapshot->super.process_name.jobid = child->name->jobid;
vpid_snapshot->super.process_name.vpid = child->name->vpid;
vpid_snapshot->super.process_name.epoch = child->name->epoch;
ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch);
/*vpid_snapshot->migrating = true;*/
opal_list_append(&(local_global_snapshot.local_snapshots), &(vpid_snapshot->super.super));
@ -2111,7 +2111,7 @@ static int snapc_full_local_refresh_vpids(void)
vpid_snapshot->process_pid = child->pid;
vpid_snapshot->super.process_name.jobid = child->name->jobid;
vpid_snapshot->super.process_name.vpid = child->name->vpid;
vpid_snapshot->super.process_name.epoch = child->name->epoch;
ORTE_EPOCH_SET(vpid_snapshot->super.process_name.epoch,child->name->epoch);
}
}

Просмотреть файл

@ -83,7 +83,7 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = 0;
ORTE_EPOCH_SET(snapshot->process_name.epoch,0);
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
}
@ -91,7 +91,7 @@ void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot)
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = 0;
ORTE_EPOCH_SET(snapshot->process_name.epoch,0);
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
}

Просмотреть файл

@ -62,7 +62,7 @@ void orte_sstore_base_local_snapshot_info_construct(orte_sstore_base_local_snaps
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);
snapshot->crs_comp = NULL;
snapshot->compress_comp = NULL;
@ -76,7 +76,7 @@ void orte_sstore_base_local_snapshot_info_destruct( orte_sstore_base_local_snaps
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(snapshot->process_name.epoch,ORTE_EPOCH_MIN);
if( NULL != snapshot->crs_comp ) {
free(snapshot->crs_comp);
@ -637,7 +637,7 @@ int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_in
vpid_snapshot->process_name.jobid = proc.jobid;
vpid_snapshot->process_name.vpid = proc.vpid;
vpid_snapshot->process_name.epoch = proc.epoch;
ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch);
}
else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) {
vpid_snapshot->crs_comp = strdup(value);

Просмотреть файл

@ -1216,8 +1216,7 @@ static int orte_sstore_central_extract_global_metadata(orte_sstore_central_globa
vpid_snapshot->process_name.jobid = handle_info->jobid;
vpid_snapshot->process_name.vpid = i;
vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID;
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name));
vpid_snapshot->crs_comp = NULL;
global_snapshot->start_time = NULL;

Просмотреть файл

@ -210,7 +210,7 @@ void orte_sstore_central_local_app_snapshot_info_construct(orte_sstore_central_l
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN);
info->local_location = NULL;
info->metadata_filename = NULL;
@ -222,7 +222,7 @@ void orte_sstore_central_local_app_snapshot_info_destruct( orte_sstore_central_l
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN);
if( NULL != info->local_location ) {
free(info->local_location);
@ -535,7 +535,7 @@ static int append_new_app_handle_info(orte_sstore_central_local_snapshot_info_t
app_info->name.jobid = name->jobid;
app_info->name.vpid = name->vpid;
app_info->name.epoch = name->epoch;
ORTE_EPOCH_SET(app_info->name.epoch,name->epoch);
opal_list_append(handle_info->app_info_handle, &(app_info->super));

Просмотреть файл

@ -1218,10 +1218,10 @@ static int process_local_push(orte_process_name_t* peer, opal_buffer_t* buffer,
p_set = OBJ_NEW(orte_filem_base_process_set_t);
p_set->source.jobid = peer->jobid;
p_set->source.vpid = peer->vpid;
p_set->source.epoch = peer->epoch;
ORTE_EPOCH_SET(p_set->source.epoch,peer->epoch);
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch);
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
}
@ -1706,8 +1706,7 @@ static int orte_sstore_stage_extract_global_metadata(orte_sstore_stage_global_sn
vpid_snapshot->process_name.jobid = handle_info->jobid;
vpid_snapshot->process_name.vpid = i;
vpid_snapshot->process_name.epoch = ORTE_EPOCH_INVALID;
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,orte_ess.proc_get_epoch(&vpid_snapshot->process_name));
/* JJH: Currently we do not have this information since we do not save
* individual vpid info in the Global SStore. It is in the metadata

Просмотреть файл

@ -287,7 +287,7 @@ void orte_sstore_stage_local_app_snapshot_info_construct(orte_sstore_stage_local
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN);
info->local_location = NULL;
info->compressed_local_location = NULL;
@ -302,7 +302,7 @@ void orte_sstore_stage_local_app_snapshot_info_destruct( orte_sstore_stage_local
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(info->name.epoch,ORTE_EPOCH_MIN);
if( NULL != info->local_location ) {
free(info->local_location);
@ -1014,7 +1014,7 @@ static int append_new_app_handle_info(orte_sstore_stage_local_snapshot_info_t *h
app_info->name.jobid = name->jobid;
app_info->name.vpid = name->vpid;
app_info->name.epoch = name->epoch;
ORTE_EPOCH_SET(app_info->name.epoch,name->epoch);
opal_list_append(handle_info->app_info_handle, &(app_info->super));
@ -2057,17 +2057,17 @@ static int orte_sstore_stage_local_preload_files(char **local_location, bool *sk
/* if I am the HNP, then use me as the source */
p_set->source.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->source.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->source.epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_NAME->epoch);
}
else {
/* otherwise, set the HNP as the source */
p_set->source.jobid = ORTE_PROC_MY_HNP->jobid;
p_set->source.vpid = ORTE_PROC_MY_HNP->vpid;
p_set->source.epoch = ORTE_PROC_MY_HNP->epoch;
ORTE_EPOCH_SET(p_set->source.epoch,ORTE_PROC_MY_HNP->epoch);
}
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
ORTE_EPOCH_SET(p_set->sink.epoch,ORTE_PROC_MY_NAME->epoch);
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
/* Define the file set */

Просмотреть файл

@ -123,18 +123,13 @@ static void send_relay(opal_buffer_t *buf)
nm = (orte_routed_tree_t*)item;
target.vpid = nm->vpid;
target.epoch = orte_util_lookup_epoch(&target);
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
if (!orte_util_proc_is_running(&target)) {
if (!PROC_IS_RUNNING(&target)) {
continue;
}
target.epoch = ORTE_EPOCH_INVALID;
if (ORTE_NODE_RANK_INVALID == (target.epoch = orte_ess.proc_get_epoch(&target))) {
/* If we are trying to send to a previously failed process it's
* better to fail silently. */
continue;
}
ORTE_EPOCH_SET(target.epoch,orte_ess.proc_get_epoch(&target));
OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
"%s orte:daemon:send_relay sending relay msg to %s",
@ -422,7 +417,8 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
proct = OBJ_NEW(orte_proc_t);
proct->name.jobid = proc.jobid;
proct->name.vpid = proc.vpid;
proct->name.epoch = proc.epoch;
ORTE_EPOCH_SET(proct->name.epoch,proc.epoch);
opal_pointer_array_add(&procarray, proct);
num_replies++;
}
@ -1059,7 +1055,9 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
orte_job_t *jdata;
orte_proc_t *proc;
orte_vpid_t vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch;
#endif
int32_t i, num_procs;
/* setup the answer */
@ -1086,12 +1084,14 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
goto CLEANUP;
}
#if ORTE_ENABLE_EPOCH
/* unpack the epoch */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &epoch, &n, ORTE_EPOCH))) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
#endif
/* if they asked for a specific proc, then just get that info */
if (ORTE_VPID_WILDCARD != vpid) {
@ -1201,7 +1201,7 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
/* loop across all daemons */
proc2.jobid = ORTE_PROC_MY_NAME->jobid;
for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) {
proc2.epoch = orte_util_lookup_epoch(&proc2);
ORTE_EPOCH_SET(proc2.epoch,orte_util_lookup_epoch(&proc2));
/* setup the cmd */
relay_msg = OBJ_NEW(opal_buffer_t);

Просмотреть файл

@ -388,14 +388,14 @@ int orte_daemon(int argc, char *argv[])
orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
ORTE_PROC_MY_DAEMON->epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_EPOCH_MIN);
/* if I am also the hnp, then update that contact info field too */
if (ORTE_PROC_IS_HNP) {
orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
ORTE_PROC_MY_HNP->epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ORTE_PROC_MY_HNP->epoch,ORTE_EPOCH_MIN);
}
/* setup the primary daemon command receive function */
@ -495,7 +495,8 @@ int orte_daemon(int argc, char *argv[])
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = jdata->jobid;
proc->name.vpid = 0;
proc->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
proc->state = ORTE_PROC_STATE_RUNNING;
proc->app_idx = 0;
proc->node = nodes[0]; /* hnp node must be there */

Просмотреть файл

@ -76,6 +76,7 @@ int orte_dt_compare_name(orte_process_name_t *value1,
}
}
#if ORTE_ENABLE_EPOCH
/** check the epochs - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
@ -87,6 +88,7 @@ int orte_dt_compare_name(orte_process_name_t *value1,
return OPAL_VALUE1_GREATER;
}
}
#endif
/** only way to get here is if all fields are equal or WILDCARD */
return OPAL_EQUAL;
@ -122,6 +124,7 @@ int orte_dt_compare_jobid(orte_jobid_t *value1,
return OPAL_EQUAL;
}
#if ORTE_ENABLE_EPOCH
int orte_dt_compare_epoch(orte_epoch_t *value1,
orte_epoch_t *value2,
opal_data_type_t type)
@ -136,6 +139,7 @@ int orte_dt_compare_epoch(orte_epoch_t *value1,
return OPAL_EQUAL;
}
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
/**

Просмотреть файл

@ -61,7 +61,7 @@ int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, opal
val->jobid = src->jobid;
val->vpid = src->vpid;
val->epoch = src->epoch;
ORTE_EPOCH_SET(val->epoch,src->epoch);
*dest = val;
return ORTE_SUCCESS;
@ -105,6 +105,7 @@ int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t typ
return ORTE_SUCCESS;
}
#if ORTE_ENABLE_EPOCH
/*
* EPOCH
*/
@ -123,6 +124,7 @@ int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, opal_data_type_t
return ORTE_SUCCESS;
}
#endif
#if !ORTE_DISABLE_FULL_SUPPORT

Просмотреть файл

@ -58,7 +58,9 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src,
orte_process_name_t* proc;
orte_jobid_t *jobid;
orte_vpid_t *vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t *epoch;
#endif
/* collect all the jobids in a contiguous array */
jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t));
@ -100,6 +102,7 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src,
}
free(vpid);
#if ORTE_ENABLE_EPOCH
/* Collect all the epochs in a contiguous array */
epoch = (orte_epoch_t *) malloc(num_vals * sizeof(orte_epoch_t));
if (NULL == epoch) {
@ -118,6 +121,7 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src,
return rc;
}
free(epoch);
#endif
return ORTE_SUCCESS;
}
@ -156,6 +160,7 @@ int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src,
return ret;
}
#if ORTE_ENABLE_EPOCH
/*
* EPOCH
*/
@ -171,6 +176,7 @@ int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src,
return ret;
}
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
/*

Просмотреть файл

@ -125,8 +125,10 @@ int orte_dt_std_print(char **output, char *prefix, void *src, opal_data_type_t t
orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, ORTE_STD_CNTR_T);
break;
#if ORTE_ENABLE_EPOCH
case ORTE_EPOCH:
orte_dt_quick_print(output, "ORTE_EPOCH", prefix, src, ORTE_EPOCH_T);
/* terminate the case: without this break control falls through into the
 * ORTE_VPID case below, which calls orte_dt_quick_print on the same
 * output pointer a second time */
break;
#endif
case ORTE_VPID:
orte_dt_quick_print(output, "ORTE_VPID", prefix, src, ORTE_VPID_T);
@ -478,11 +480,21 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
if (orte_xml_output) {
/* need to create the output in XML format */
if (0 == src->pid) {
#if ORTE_ENABLE_EPOCH
asprintf(output, "%s<process rank=\"%s\" status=\"%s\" epoch=\"%s\"/>\n", pfx2,
ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch));
#else
asprintf(output, "%s<process rank=\"%s\" status=\"%s\"/>\n", pfx2,
ORTE_VPID_PRINT(src->name.vpid), orte_proc_state_to_str(src->state));
#endif
} else {
#if ORTE_ENABLE_EPOCH
asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\" epoch=\"%s\"/>\n", pfx2,
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state), ORTE_EPOCH_PRINT(src->name.epoch));
#else
asprintf(output, "%s<process rank=\"%s\" pid=\"%d\" status=\"%s\"/>\n", pfx2,
ORTE_VPID_PRINT(src->name.vpid), (int)src->pid, orte_proc_state_to_str(src->state));
#endif
}
free(pfx2);
return ORTE_SUCCESS;
@ -490,10 +502,17 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
if (!orte_devel_level_output) {
/* just print a very simple output for users */
#if ORTE_ENABLE_EPOCH
/* epoch-enabled build: report jobid, rank and epoch */
asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s Epoch: %s", pfx2,
ORTE_JOBID_PRINT(src->name.jobid),
ORTE_VPID_PRINT(src->name.vpid),
ORTE_EPOCH_PRINT(src->name.epoch));
#else
/* epoch-disabled build: only jobid and rank are passed, so the format must
 * not contain an "Epoch: %s" conversion (a conversion without a matching
 * argument is undefined behaviour) */
asprintf(&tmp, "\n%sProcess OMPI jobid: %s Process rank: %s", pfx2,
ORTE_JOBID_PRINT(src->name.jobid),
ORTE_VPID_PRINT(src->name.vpid));
#endif
/* set the return */
*output = tmp;
free(pfx2);

Просмотреть файл

@ -45,9 +45,11 @@ int orte_dt_std_size(size_t *size, void *src, opal_data_type_t type)
*size = sizeof(orte_std_cntr_t);
break;
#if ORTE_ENABLE_EPOCH
case ORTE_EPOCH:
*size = sizeof(orte_epoch_t);
break;
#endif
case ORTE_VPID:
*size = sizeof(orte_vpid_t);

Просмотреть файл

@ -52,9 +52,14 @@ int orte_dt_compare_jobid(orte_jobid_t *value1,
int orte_dt_compare_vpid(orte_vpid_t *value1,
orte_vpid_t *value2,
opal_data_type_t type);
/* Epoch comparison support.
 * ORTE_EPOCH_CMP(n,m) yields (m) - (n) when epochs are compiled in and the
 * constant 0 otherwise; the orte_dt_compare_epoch prototype is only declared
 * in the epoch-enabled configuration. */
#if !ORTE_ENABLE_EPOCH
#define ORTE_EPOCH_CMP(n,m) ( 0 )
#else
#define ORTE_EPOCH_CMP(n,m) ( (m) - (n) )
int orte_dt_compare_epoch(orte_epoch_t *value1,
orte_epoch_t *value2,
opal_data_type_t type);
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, opal_data_type_t type);
int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, opal_data_type_t type);
@ -86,7 +91,9 @@ int orte_dt_copy_std_cntr(orte_std_cntr_t **dest, orte_std_cntr_t *src, opal_dat
int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, opal_data_type_t type);
int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, opal_data_type_t type);
int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t type);
#if ORTE_ENABLE_EPOCH
int orte_dt_copy_epoch(orte_epoch_t **dest, orte_epoch_t *src, opal_data_type_t type);
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t type);
int orte_dt_copy_node(orte_node_t **dest, orte_node_t *src, opal_data_type_t type);
@ -116,8 +123,10 @@ int orte_dt_pack_jobid(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type);
int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type);
#if ORTE_ENABLE_EPOCH
int orte_dt_pack_epoch(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type);
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type);
@ -185,8 +194,10 @@ int orte_dt_unpack_jobid(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type);
int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type);
#if ORTE_ENABLE_EPOCH
int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type);
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type);

Просмотреть файл

@ -54,7 +54,9 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
orte_process_name_t* proc;
orte_jobid_t *jobid;
orte_vpid_t *vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t *epoch;
#endif
num = *num_vals;
@ -92,6 +94,7 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
return rc;
}
#if ORTE_ENABLE_EPOCH
/* collect all the epochs in a contiguous array */
epoch= (orte_epoch_t*)malloc(num * sizeof(orte_epoch_t));
if (NULL == epoch) {
@ -109,18 +112,21 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
free(jobid);
return rc;
}
#endif
/* build the names from the jobid/vpid/epoch arrays */
proc = (orte_process_name_t*)dest;
for (i=0; i < num; i++) {
proc->jobid = jobid[i];
proc->vpid = vpid[i];
proc->epoch = epoch[i];
ORTE_EPOCH_SET(proc->epoch,epoch[i]);
proc++;
}
/* cleanup */
#if ORTE_ENABLE_EPOCH
free(epoch);
#endif
free(vpid);
free(jobid);
@ -159,6 +165,7 @@ int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest,
return ret;
}
#if ORTE_ENABLE_EPOCH
/*
* EPOCH
*/
@ -174,6 +181,7 @@ int orte_dt_unpack_epoch(opal_buffer_t *buffer, void *dest,
return ret;
}
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
/*

Просмотреть файл

@ -220,7 +220,7 @@ static void process_message(int fd, short event, void *evdat)
data->port = port_name;
data->owner.jobid = sender->jobid;
data->owner.vpid = sender->vpid;
data->owner.epoch = sender->epoch;
ORTE_EPOCH_SET(data->owner.epoch,sender->epoch);
/* store the data */
data->index = opal_pointer_array_add(orte_data_server_store, data);

Просмотреть файл

@ -277,6 +277,7 @@ int orte_dt_init(void)
return rc;
}
#if ORTE_ENABLE_EPOCH
tmp = ORTE_EPOCH;
if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_epoch,
orte_dt_unpack_epoch,
@ -290,6 +291,7 @@ int orte_dt_init(void)
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
tmp = ORTE_JOB;
@ -933,7 +935,7 @@ static void orte_proc_construct(orte_proc_t* proc)
proc->beat = 0;
OBJ_CONSTRUCT(&proc->stats, opal_ring_buffer_t);
opal_ring_buffer_init(&proc->stats, orte_stat_history_size);
proc->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_MIN);
#if OPAL_ENABLE_FT_CR == 1
proc->ckpt_state = 0;
proc->ckpt_snapshot_ref = NULL;

Просмотреть файл

@ -57,8 +57,17 @@ int orte_debug_verbosity;
char *orte_prohibited_session_dirs = NULL;
bool orte_create_session_dirs = true;
/* Global wildcard and invalid process names. The third (epoch) initializer
 * is supplied only when ORTE_ENABLE_EPOCH is set. */
#if ORTE_ENABLE_EPOCH
orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD};
orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_INVALID};
#else
orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
#endif
#if OPAL_CC_USE_PRAGMA_IDENT

Просмотреть файл

@ -204,7 +204,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
mev = OBJ_NEW(orte_message_event_t); \
mev->sender.jobid = (sndr)->jobid; \
mev->sender.vpid = (sndr)->vpid; \
mev->sender.epoch = (sndr)->epoch; \
ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \
opal_dss.copy_payload(mev->buffer, (buf)); \
mev->tag = (tg); \
mev->file = strdup((buf)->parent.cls_init_file_name); \
@ -228,7 +228,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
mev = OBJ_NEW(orte_message_event_t); \
mev->sender.jobid = (sndr)->jobid; \
mev->sender.vpid = (sndr)->vpid; \
mev->sender.epoch = (sndr)->epoch; \
ORTE_EPOCH_SET(mev->sender.epoch,(sndr)->epoch); \
opal_dss.copy_payload(mev->buffer, (buf)); \
mev->tag = (tg); \
opal_event_evtimer_set(opal_event_base, \
@ -258,7 +258,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_notify_event_t);
tmp = OBJ_NEW(orte_notify_event_t); \
tmp->proc.jobid = (data)->jobid; \
tmp->proc.vpid = (data)->vpid; \
tmp->proc.epoch = (data)->epoch; \
ORTE_EPOCH_SET(tmp->proc.epoch,(data)->epoch); \
opal_event.evtimer_set(opal_event_base, \
tmp->ev, (cbfunc), tmp); \
now.tv_sec = 0; \

Просмотреть файл

@ -74,8 +74,7 @@ main(int argc, char *argv[]){
for (j=1; j < count+1; j++) {
peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
peer.epoch = ORTE_EPOCH_INVALID;
peer.epoch = orte_ess.proc_get_epoch(&peer);
ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
/* rank0 starts ring */
if (ORTE_PROC_MY_NAME->vpid == 0) {

Просмотреть файл

@ -41,16 +41,14 @@ main(int argc, char *argv[]){
if( right_peer_orte_name.vpid >= num_peers ) {
right_peer_orte_name.vpid = 0;
}
right_peer_orte_name.epoch = ORTE_EPOCH_INVALID;
right_peer_orte_name.epoch = orte_ess.proc_get_epoch(&right_peer_orte_name);
ORTE_EPOCH_SET(right_peer_orte_name.epoch,orte_ess.proc_get_epoch(&right_peer_orte_name));
left_peer_orte_name.jobid = ORTE_PROC_MY_NAME->jobid;
left_peer_orte_name.vpid = ORTE_PROC_MY_NAME->vpid - 1;
if( ORTE_PROC_MY_NAME->vpid == 0 ) {
left_peer_orte_name.vpid = num_peers - 1;
}
left_peer_orte_name.epoch = ORTE_EPOCH_INVALID;
left_peer_orte_name.epoch = orte_ess.proc_get_epoch(&left_peer_orte_name);
ORTE_EPOCH_SET(left_peer_orte_name.epoch,orte_ess.proc_get_epoch(&left_peer_orte_name));
printf("My name is: %s -- PID %d\tMy Left Peer is %s\tMy Right Peer is %s\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), getpid(),

Просмотреть файл

@ -74,8 +74,8 @@ int main(int argc, char* argv[])
for (i=0; i < app->num_procs; i++) {
name.vpid = i;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
ORTE_EPOCH_SET(name.epoch,orte_ess.proc_get_epoch(&name));
fprintf(stderr, "Parent: sending message to child %s\n", ORTE_NAME_PRINT(&name));
if (0 > (rc = orte_rml.send(&name, &msg, 1, MY_TAG, 0))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -869,8 +869,14 @@ static int gather_vpid_info(orte_ps_mpirun_info_t *hnpinfo) {
}
/* query the HNP for info on the procs in this job */
if (ORTE_SUCCESS != (ret = orte_util_comm_query_proc_info(&(hnpinfo->hnp->name), job->jobid,
ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD, &cnt, &procs))) {
if (ORTE_SUCCESS != (ret = orte_util_comm_query_proc_info(&(hnpinfo->hnp->name),
job->jobid,
ORTE_VPID_WILDCARD,
#if ORTE_ENABLE_EPOCH
ORTE_EPOCH_WILDCARD,
#endif
&cnt,
&procs))) {
ORTE_ERROR_LOG(ret);
}
job->procs->addr = (void**)procs;

Просмотреть файл

@ -471,7 +471,7 @@ main(int argc, char *argv[])
if (NULL == ranks) {
/* take all ranks */
proc.vpid = ORTE_VPID_WILDCARD;
proc.epoch = ORTE_EPOCH_WILDCARD;
ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_WILDCARD);
if (ORTE_SUCCESS != (ret = opal_dss.pack(&cmdbuf, &proc, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
goto cleanup;

Просмотреть файл

@ -433,8 +433,13 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
return ORTE_SUCCESS;
}
#if ORTE_ENABLE_EPOCH
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
orte_epoch_t epoch, int *num_procs, orte_proc_t ***proc_info_array)
#else
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
int *num_procs, orte_proc_t ***proc_info_array)
#endif
{
int ret;
int32_t cnt, cnt_procs, n;
@ -463,11 +468,13 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
OBJ_RELEASE(cmd);
return ret;
}
#if ORTE_ENABLE_EPOCH
if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
}
#endif
/* define a max time to wait for send to complete */
timer_fired = false;
error_exit = ORTE_SUCCESS;

Просмотреть файл

@ -52,7 +52,10 @@ ORTE_DECLSPEC int orte_util_comm_query_node_info(const orte_process_name_t *hnp,
int *num_nodes, orte_node_t ***node_info_array);
ORTE_DECLSPEC int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
orte_epoch_t epoch, int *num_procs, orte_proc_t ***proc_info_array);
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch,
#endif
int *num_procs, orte_proc_t ***proc_info_array);
ORTE_DECLSPEC int orte_util_comm_spawn_job(const orte_process_name_t *hnp, orte_job_t *jdata);

Просмотреть файл

@ -55,7 +55,8 @@ static void orte_hnp_contact_construct(orte_hnp_contact_t *ptr)
{
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(ptr->name.epoch,ORTE_EPOCH_MIN);
ptr->rml_uri = NULL;
}
static void orte_hnp_contact_destruct(orte_hnp_contact_t *ptr)

Просмотреть файл

@ -46,7 +46,7 @@ static void orte_namelist_construct(orte_namelist_t* list)
{
list->name.jobid = ORTE_JOBID_INVALID;
list->name.vpid = ORTE_VPID_INVALID;
list->name.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(list->name.epoch,ORTE_EPOCH_MIN);
}
/* destructor - used to free any resources held by instance */
@ -116,7 +116,10 @@ get_print_name_buffer(void)
char* orte_util_print_name_args(const orte_process_name_t *name)
{
orte_print_args_buffers_t *ptr;
char *job, *vpid, *epoch;
char *job, *vpid;
#if ORTE_ENABLE_EPOCH
char *epoch;
#endif
/* protect against NULL names */
if (NULL == name) {
@ -141,7 +144,7 @@ char* orte_util_print_name_args(const orte_process_name_t *name)
*/
job = orte_util_print_jobids(name->jobid);
vpid = orte_util_print_vpids(name->vpid);
epoch = orte_util_print_epoch(name->epoch);
ORTE_EPOCH_SET(epoch,orte_util_print_epoch(name->epoch));
/* get the next buffer */
ptr = get_print_name_buffer();
@ -156,9 +159,15 @@ char* orte_util_print_name_args(const orte_process_name_t *name)
ptr->cntr = 0;
}
#if ORTE_ENABLE_EPOCH
snprintf(ptr->buffers[ptr->cntr++],
ORTE_PRINT_NAME_ARGS_MAX_SIZE,
"[%s,%s,%s]", job, vpid, epoch);
#else
snprintf(ptr->buffers[ptr->cntr++],
ORTE_PRINT_NAME_ARGS_MAX_SIZE,
"[%s,%s]", job, vpid);
#endif
return ptr->buffers[ptr->cntr-1];
}
@ -282,6 +291,7 @@ char* orte_util_print_vpids(const orte_vpid_t vpid)
return ptr->buffers[ptr->cntr-1];
}
#if ORTE_ENABLE_EPOCH
char* orte_util_print_epoch(const orte_epoch_t epoch)
{
orte_print_args_buffers_t *ptr;
@ -309,6 +319,7 @@ char* orte_util_print_epoch(const orte_epoch_t epoch)
}
return ptr->buffers[ptr->cntr-1];
}
#endif
@ -403,6 +414,7 @@ int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring)
return ORTE_SUCCESS;
}
#if ORTE_ENABLE_EPOCH
int orte_util_convert_epoch_to_string(char **epoch_string, const orte_epoch_t epoch)
{
/* check for wildcard value - handle appropriately */
@ -425,7 +437,6 @@ int orte_util_convert_epoch_to_string(char **epoch_string, const orte_epoch_t ep
return ORTE_SUCCESS;
}
int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* epoch_string)
{
if (NULL == epoch_string) { /* got an error */
@ -450,6 +461,7 @@ int orte_util_convert_string_to_epoch(orte_epoch_t *epoch, const char* epoch_str
return ORTE_SUCCESS;
}
#endif
int orte_util_convert_string_to_process_name(orte_process_name_t *name,
const char* name_string)
@ -457,13 +469,15 @@ int orte_util_convert_string_to_process_name(orte_process_name_t *name,
char *temp, *token;
orte_jobid_t job;
orte_vpid_t vpid;
#if ORTE_ENABLE_EPOCH
orte_epoch_t epoch;
#endif
int return_code=ORTE_SUCCESS;
/* set default */
name->jobid = ORTE_JOBID_INVALID;
name->vpid = ORTE_VPID_INVALID;
name->epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(name->epoch,ORTE_EPOCH_MIN);
/* check for NULL string - error */
if (NULL == name_string) {
@ -510,6 +524,7 @@ int orte_util_convert_string_to_process_name(orte_process_name_t *name,
vpid = strtoul(token, NULL, 10);
}
#if ORTE_ENABLE_EPOCH
token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> epoch*/
/* check for error */
@ -528,10 +543,11 @@ int orte_util_convert_string_to_process_name(orte_process_name_t *name,
} else {
epoch = strtoul(token, NULL, 10);
}
#endif
name->jobid = job;
name->vpid = vpid;
name->epoch = epoch;
ORTE_EPOCH_SET(name->epoch,epoch);
free(temp);
@ -568,6 +584,7 @@ int orte_util_convert_process_name_to_string(char **name_string,
asprintf(&tmp2, "%s%c%lu", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned long)name->vpid);
}
#if ORTE_ENABLE_EPOCH
if (ORTE_EPOCH_WILDCARD == name->epoch) {
asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
} else if (ORTE_EPOCH_INVALID == name->epoch) {
@ -575,6 +592,10 @@ int orte_util_convert_process_name_to_string(char **name_string,
} else {
asprintf(name_string, "%s%c%lu", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, (unsigned long)name->epoch);
}
#else
asprintf(name_string, "%s", tmp2);
#endif
free(tmp);
free(tmp2);
@ -585,8 +606,11 @@ int orte_util_convert_process_name_to_string(char **name_string,
/**** CREATE PROCESS NAME ****/
int orte_util_create_process_name(orte_process_name_t **name,
orte_jobid_t job,
orte_vpid_t vpid,
orte_epoch_t epoch)
orte_vpid_t vpid
#if ORTE_ENABLE_EPOCH
,orte_epoch_t epoch
#endif
)
{
*name = NULL;
@ -598,7 +622,8 @@ int orte_util_create_process_name(orte_process_name_t **name,
(*name)->jobid = job;
(*name)->vpid = vpid;
(*name)->epoch = epoch;
ORTE_EPOCH_SET((*name)->epoch,epoch);
return ORTE_SUCCESS;
}
@ -655,6 +680,7 @@ int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields,
}
}
#if ORTE_ENABLE_EPOCH
/* Get here if jobid's and vpid's are equal, or not being checked.
* Now check epoch.
*/
@ -666,6 +692,7 @@ int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields,
return OPAL_VALUE1_GREATER;
}
}
#endif
/* only way to get here is if all fields are being checked and are equal,
* or jobid not checked, but vpid equal,

Просмотреть файл

@ -61,9 +61,13 @@ ORTE_DECLSPEC char* orte_util_print_vpids(const orte_vpid_t vpid);
#define ORTE_VPID_PRINT(n) \
orte_util_print_vpids(n)
/* Epoch pretty-printer.
 *
 * With ORTE_ENABLE_EPOCH set, ORTE_EPOCH_PRINT(n) forwards to
 * orte_util_print_epoch(n), which returns a char*.
 *
 * With epochs compiled out, ORTE_EPOCH_PRINT(n) expands to NOTHING (not to
 * an empty string), so it cannot stand in for an expression: a call site
 * that passes it as a printf-style argument must itself be guarded with
 * #if ORTE_ENABLE_EPOCH, together with the matching format specifier.
 */
#if ORTE_ENABLE_EPOCH
ORTE_DECLSPEC char* orte_util_print_epoch(const orte_epoch_t epoch);
#define ORTE_EPOCH_PRINT(n) \
orte_util_print_epoch(n)
#else
#define ORTE_EPOCH_PRINT(n)
#endif
ORTE_DECLSPEC char* orte_util_print_job_family(const orte_jobid_t job);
#define ORTE_JOB_FAMILY_PRINT(n) \
@ -104,6 +108,24 @@ ORTE_DECLSPEC char *orte_pretty_print_timing(int64_t secs, int64_t usecs);
#define ORTE_JOBID_IS_DAEMON(n) \
!((n) & 0x0000ffff)
/* Macro for getting the epoch out of the process name.
 *
 * n is a pointer to a structure with an epoch member; with epochs enabled
 * the macro yields (n)->epoch.
 *
 * With epochs compiled out the macro expands to nothing and n is not
 * evaluated, so it must not be used where a value is required (assignment
 * right-hand side, function argument, comparison) outside of an
 * #if ORTE_ENABLE_EPOCH guard.
 */
#if ORTE_ENABLE_EPOCH
#define ORTE_EPOCH_GET(n) \
((n)->epoch)
#else
#define ORTE_EPOCH_GET(n)
#endif
/* Macro for setting the epoch in the process name.
 *
 * n is the epoch lvalue to assign (e.g. name->epoch) and m is the new value.
 *
 * With epochs enabled this is a plain assignment expression.  With epochs
 * compiled out it is a no-op and NEITHER argument is evaluated, which is
 * what allows call sites to name an epoch field that does not exist in that
 * configuration.
 *
 * The no-op form intentionally has no semicolon after while(0): the caller's
 * own ';' completes the statement, so "ORTE_EPOCH_SET(a,b);" is exactly one
 * statement in both configurations and stays legal as the body of an
 * unbraced if that is followed by an else.
 */
#if ORTE_ENABLE_EPOCH
#define ORTE_EPOCH_SET(n,m) \
( (n) = (m) )
#else
#define ORTE_EPOCH_SET(n,m) \
do { \
} while(0)
#endif
/* List of names for general use */
struct orte_namelist_t {
opal_list_item_t item; /**< Allows this item to be placed on a list */
@ -117,16 +139,24 @@ ORTE_DECLSPEC int orte_util_convert_jobid_to_string(char **jobid_string, const o
ORTE_DECLSPEC int orte_util_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring);
ORTE_DECLSPEC int orte_util_convert_vpid_to_string(char **vpid_string, const orte_vpid_t vpid);
ORTE_DECLSPEC int orte_util_convert_string_to_vpid(orte_vpid_t *vpid, const char* vpidstring);
/* String <-> epoch conversions; only declared when epochs are compiled in.
 *
 * NOTE(review): orte_util_convert_string_to_epoch takes an orte_vpid_t*
 * output parameter although it produces an epoch; this looks like it should
 * be orte_epoch_t* -- confirm against the definition before changing.
 */
#if ORTE_ENABLE_EPOCH
ORTE_DECLSPEC int orte_util_convert_epoch_to_string(char **epoch_string, const orte_epoch_t epoch);
ORTE_DECLSPEC int orte_util_convert_string_to_epoch(orte_vpid_t *epoch, const char* epochstring);
#endif
ORTE_DECLSPEC int orte_util_convert_string_to_process_name(orte_process_name_t *name,
const char* name_string);
ORTE_DECLSPEC int orte_util_convert_process_name_to_string(char** name_string,
const orte_process_name_t *name);
/* Create a process name from its individual fields.
 *
 * The prototype depends on the build configuration: with ORTE_ENABLE_EPOCH
 * the function takes a fourth argument (the epoch); without it the function
 * takes only the jobid and vpid.  Callers that pass an epoch therefore have
 * to guard that argument with #if ORTE_ENABLE_EPOCH as well.
 */
#if ORTE_ENABLE_EPOCH
ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name,
orte_jobid_t job,
orte_vpid_t vpid,
orte_epoch_t epoch);
#else
ORTE_DECLSPEC int orte_util_create_process_name(orte_process_name_t **name,
orte_jobid_t job,
orte_vpid_t vpid);
#endif
ORTE_DECLSPEC int orte_util_compare_name_fields(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
const orte_process_name_t* name2);

Просмотреть файл

@ -249,7 +249,7 @@ int orte_util_build_daemon_nidmap(char **nodes)
*/
/* construct the URI */
proc.vpid = node->daemon;
proc.epoch = ORTE_EPOCH_MIN;
ORTE_EPOCH_SET(proc.epoch,ORTE_EPOCH_MIN);
orte_util_convert_process_name_to_string(&proc_name, &proc);
asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
@ -1001,6 +1001,7 @@ void print_orte_job_data() {
}
#endif
#if ORTE_ENABLE_EPOCH
/* Look up the current epoch value that we have stored locally.
*
* Note that this will not ping the HNP to get the most up to date epoch stored
@ -1023,7 +1024,9 @@ orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, orte_epoch_t epoch)
/*print_orte_job_data();*/
return e;
}
#endif
#if ORTE_RESIL_ORTE
bool orte_util_proc_is_running(orte_process_name_t *proc) {
int i;
unsigned int j;
@ -1078,7 +1081,9 @@ int orte_util_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state)
return ORTE_ERROR;
}
#endif
#if ORTE_ENABLE_EPOCH
/*
* This function performs both the get and set operations on the epoch for a
* specific process name. If the epoch passed into the function is
@ -1091,6 +1096,11 @@ orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, orte_epoch_
orte_job_t *jdata;
orte_proc_t *pdata;
if (ORTE_JOBID_INVALID == proc->jobid ||
ORTE_VPID_INVALID == proc->vpid) {
return ORTE_EPOCH_INVALID;
}
/* Sanity check just to make sure we don't overwrite our existing
* orte_job_data.
*/
@ -1165,4 +1175,5 @@ orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, orte_epoch_
return ORTE_EPOCH_MIN;
}
}
#endif

Просмотреть файл

@ -48,11 +48,19 @@ ORTE_DECLSPEC orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job);
ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc);
ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc);
#if ORTE_ENABLE_EPOCH
ORTE_DECLSPEC orte_epoch_t orte_util_lookup_epoch(orte_process_name_t *proc);
ORTE_DECLSPEC orte_epoch_t orte_util_set_epoch(orte_process_name_t *proc, orte_epoch_t epoch);
#endif
/* NOTE(review): this prototype is unconditional, but the definition of
 * orte_util_set_proc_state appears to be compiled only under
 * ORTE_RESIL_ORTE -- confirm, otherwise a caller in a non-resilient build
 * would compile but fail to link.
 */
ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state);
/* PROC_IS_RUNNING(n): with ORTE_RESIL_ORTE this calls
 * orte_util_proc_is_running(n); otherwise it is the constant true and the
 * argument n is not evaluated.
 */
#if ORTE_RESIL_ORTE
#define PROC_IS_RUNNING(n) orte_util_proc_is_running(n)
ORTE_DECLSPEC bool orte_util_proc_is_running(orte_process_name_t *proc);
#else
#define PROC_IS_RUNNING(n) ( true )
#endif
ORTE_DECLSPEC int orte_util_encode_nodemap(opal_byte_object_t *boptr);
ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr);
@ -72,5 +80,8 @@ ORTE_DECLSPEC void orte_jobmap_dump(void);
END_C_DECLS
/* Local functions */
#if ORTE_ENABLE_EPOCH
orte_epoch_t get_epoch_from_orte_job_data(orte_process_name_t *proc, orte_epoch_t epoch);
#endif
#endif

Просмотреть файл

@ -36,13 +36,19 @@
#include "orte/util/proc_info.h"
/* Brace initializer for an invalid process name, used below for the name
 * fields of orte_process_info.  It carries a third element (ORTE_EPOCH_MIN)
 * only when ORTE_ENABLE_EPOCH is set, so the same initializer text works in
 * both build configurations.
 */
#if ORTE_ENABLE_EPOCH
#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_MIN}
#else
#define ORTE_NAME_INVALID {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}
#endif
ORTE_DECLSPEC orte_proc_info_t orte_process_info = {
/* .my_name = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_MIN},
/* .my_daemon = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_MIN},
/* .my_name = */ ORTE_NAME_INVALID,
/* .my_daemon = */ ORTE_NAME_INVALID,
/* .my_daemon_uri = */ NULL,
/* .my_hnp = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_MIN},
/* .my_hnp = */ ORTE_NAME_INVALID,
/* .my_hnp_uri = */ NULL,
/* .my_parent = */ {ORTE_JOBID_INVALID, ORTE_VPID_INVALID, ORTE_EPOCH_MIN},
/* .my_parent = */ ORTE_NAME_INVALID,
/* .hnp_pid = */ 0,
/* .app_num = */ 0,
/* .num_procs = */ 1,

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше