Add resilience to ORTE. This allows the runtime to continue after a process (or
ORTED) failure. Note that more work will be necessary to allow the MPI layer to take advantage of this. Per RFC: http://www.open-mpi.org/community/lists/devel/2011/06/9299.php This commit was SVN r24815.
Этот коммит содержится в:
родитель
e8817f3f63
Коммит
e1ba09ad51
1
AUTHORS
1
AUTHORS
@ -85,6 +85,7 @@ tprins Tim Prins IU, LANL
|
||||
twoodall Tim Woodall LANL
|
||||
vasily Vasily Filipov Mellanox
|
||||
vsahay Vishal Sahay IU
|
||||
wbland Wesley Bland UTK
|
||||
yuw Weikuan Yu LANL, OSU
|
||||
|
||||
Affiliation abbreviations:
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -234,6 +234,12 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
|
||||
return new_errhandler;
|
||||
}
|
||||
|
||||
/**
|
||||
* Runtime errhandler callback
|
||||
*/
|
||||
void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs) {
|
||||
ompi_mpi_abort(MPI_COMM_WORLD, 1, false);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -35,6 +35,8 @@
|
||||
#include "ompi/errhandler/errhandler_predefined.h"
|
||||
#include "ompi/errhandler/errcode-internal.h"
|
||||
|
||||
#include "orte/types.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
@ -358,6 +360,19 @@ struct ompi_request_t;
|
||||
OMPI_DECLSPEC ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
|
||||
ompi_errhandler_generic_handler_fn_t *func,
|
||||
ompi_errhandler_lang_t language);
|
||||
|
||||
/**
|
||||
* Callback function from runtime layer to alert the MPI layer of an error at
|
||||
* the runtime layer.
|
||||
*
|
||||
* @param procs The names of the processes that have failed.
|
||||
*
|
||||
* This function is used to alert the MPI layer to a specific fault at the
|
||||
* runtime layer. Currently, the only faults reported using this method are
|
||||
* process failures. The MPI layer has the option to perform whatever actions it
|
||||
* needs to stabilize itself and continue running, abort, etc.
|
||||
*/
|
||||
OMPI_DECLSPEC void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs);
|
||||
|
||||
/**
|
||||
* Check to see if an errhandler is intrinsic.
|
||||
|
@ -660,8 +660,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
|
||||
bool found = false;
|
||||
|
||||
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
|
||||
"jobid %d, vpid %d, sid %" PRIx64 ", lid %d",
|
||||
process_name->jobid, process_name->vpid, subnet_id, lid));
|
||||
"jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d",
|
||||
process_name->jobid, process_name->vpid, process_name->epoch, subnet_id, lid));
|
||||
/* find ibproc */
|
||||
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||
for (ib_proc = (mca_btl_openib_proc_t*)
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -1208,6 +1208,7 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
peer = OBJ_NEW(orte_namelist_t);
|
||||
peer->name.jobid = comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid;
|
||||
peer->name.vpid = comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid;
|
||||
peer->name.epoch = comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch;
|
||||
opal_list_append(&peers, &peer->item);
|
||||
}
|
||||
/* prepare send data */
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 The University of Tennessee and The University
|
||||
* Copyright (c) 2010-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -35,6 +35,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/dpm/dpm.h"
|
||||
@ -701,6 +702,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
|
||||
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
peer_rev->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
|
||||
@ -728,6 +730,7 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
|
||||
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
peer_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
|
||||
HOKE_TRAFFIC_MSG_REF_RETURN(item);
|
||||
@ -837,6 +840,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->matched = INVALID_INT;
|
||||
msg_ref->done = INVALID_INT;
|
||||
@ -864,6 +868,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->matched = INVALID_INT;
|
||||
msg_ref->done = INVALID_INT;
|
||||
@ -897,6 +902,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->done = INVALID_INT;
|
||||
msg_ref->active = INVALID_INT;
|
||||
@ -928,6 +934,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
msg_ref->done = INVALID_INT;
|
||||
msg_ref->active = INVALID_INT;
|
||||
@ -947,6 +954,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
|
||||
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
|
||||
}
|
||||
|
||||
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
|
||||
@ -954,6 +962,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
|
||||
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
|
||||
}
|
||||
|
||||
|
||||
@ -1006,7 +1015,7 @@ do { \
|
||||
}
|
||||
|
||||
|
||||
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
|
||||
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
|
||||
{ \
|
||||
HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \
|
||||
\
|
||||
@ -1025,6 +1034,7 @@ do { \
|
||||
\
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
msg_ref->proc_name.epoch = p_epoch; \
|
||||
\
|
||||
msg_ref->matched = 0; \
|
||||
msg_ref->done = 0; \
|
||||
@ -1033,7 +1043,7 @@ do { \
|
||||
msg_ref->active_drain = 0; \
|
||||
}
|
||||
|
||||
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
|
||||
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
|
||||
{ \
|
||||
HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \
|
||||
\
|
||||
@ -1053,6 +1063,7 @@ do { \
|
||||
\
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
msg_ref->proc_name.epoch = p_epoch; \
|
||||
}
|
||||
|
||||
|
||||
@ -1455,6 +1466,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
|
||||
|
||||
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
|
||||
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
|
||||
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch;
|
||||
|
||||
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
|
||||
}
|
||||
@ -3225,7 +3237,8 @@ static int traffic_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
CREATE_NEW_MSG((*msg_ref), msg_type,
|
||||
count, ddt_size, tag, dest, comm,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
peer_ref->proc_name.vpid
|
||||
peer_ref->proc_name.epoch);
|
||||
} else {
|
||||
CREATE_NEW_MSG((*msg_ref), msg_type,
|
||||
count, ddt_size, tag, dest, comm,
|
||||
@ -3364,6 +3377,7 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
|
||||
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
|
||||
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
|
||||
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
|
||||
(*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch;
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
@ -3794,7 +3808,8 @@ static int drain_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type,
|
||||
count, NULL, tag, dest, comm,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
peer_ref->proc_name.vpid
|
||||
peer_ref->proc_name.epoch);
|
||||
|
||||
(*msg_ref)->done = 0;
|
||||
(*msg_ref)->active = 0;
|
||||
@ -4142,6 +4157,7 @@ static int drain_message_copy_remove(ompi_crcp_bkmrk_pml_drain_message_ref_t *dr
|
||||
static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
|
||||
{
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(&ompi_crcp_bkmrk_pml_peer_refs);
|
||||
item != opal_list_get_end(&ompi_crcp_bkmrk_pml_peer_refs);
|
||||
@ -4149,7 +4165,9 @@ static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
|
||||
ompi_crcp_bkmrk_pml_peer_ref_t *cur_peer_ref;
|
||||
cur_peer_ref = (ompi_crcp_bkmrk_pml_peer_ref_t*)item;
|
||||
|
||||
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
|
||||
if( OPAL_EQUAL == orte_util_compare_name_fields(mask,
|
||||
&(cur_peer_ref->proc_name),
|
||||
&proc) ) {
|
||||
return cur_peer_ref;
|
||||
@ -5266,6 +5284,7 @@ static int send_bookmarks(int peer_idx)
|
||||
*/
|
||||
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
|
||||
|
||||
if( NULL == (peer_ref = find_peer(peer_name))) {
|
||||
opal_output(mca_crcp_bkmrk_component.super.output_handle,
|
||||
@ -5326,6 +5345,7 @@ static int recv_bookmarks(int peer_idx)
|
||||
|
||||
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
|
||||
|
||||
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
|
||||
OMPI_CRCP_COORD_BOOKMARK_TAG,
|
||||
@ -5507,6 +5527,7 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
|
||||
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
|
||||
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
|
||||
d_msg_ack->peer.epoch = peer_ref->proc_name.epoch;
|
||||
d_msg_ack->complete = false;
|
||||
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle,
|
||||
@ -6146,7 +6167,8 @@ static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t *peer_r
|
||||
count, datatype_size, tag, rank,
|
||||
ompi_comm_lookup(comm_id),
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
peer_ref->proc_name.vpid
|
||||
peer_ref->proc_name.epoch);
|
||||
|
||||
traffic_message_create_drain_message(true, num_left_unresolved,
|
||||
peer_ref,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -1130,6 +1130,7 @@ static void process_cb(int fd, short event, void *data)
|
||||
/* flag the identity of the remote proc */
|
||||
carport.jobid = mev->sender.jobid;
|
||||
carport.vpid = mev->sender.vpid;
|
||||
carport.epoch = mev->sender.epoch;
|
||||
|
||||
/* release the event */
|
||||
OBJ_RELEASE(mev);
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -395,12 +398,13 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
|
||||
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
|
||||
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
|
||||
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
|
||||
orte_proc.epoch = hdr->hdr_restart.hdr_epoch;
|
||||
ompi_proc = ompi_proc_find(&orte_proc);
|
||||
opal_output_verbose(20, mca_pml_bfo_output,
|
||||
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
|
||||
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
|
||||
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
|
||||
"hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s",
|
||||
"hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, ompi_proc->proc_hostname=%s",
|
||||
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
|
||||
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||||
@ -408,8 +412,8 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
|
||||
hdr->hdr_restart.hdr_restartseq,
|
||||
recvreq->remote_req_send.pval, (void *)recvreq,
|
||||
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||||
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
|
||||
ompi_proc->proc_hostname);
|
||||
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
|
||||
hdr->hdr_restart.hdr_epoch, ompi_proc->proc_hostname);
|
||||
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
|
||||
return;
|
||||
}
|
||||
@ -711,6 +715,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
|
||||
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
|
||||
restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
|
||||
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -415,6 +415,7 @@ struct mca_pml_bfo_restart_hdr_t {
|
||||
int32_t hdr_dst_rank; /**< needed to send NACK */
|
||||
uint32_t hdr_jobid; /**< needed to send NACK */
|
||||
uint32_t hdr_vpid; /**< needed to send NACK */
|
||||
uint32_t hdr_epoch; /**< needed to send NACK */
|
||||
};
|
||||
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
|
||||
@ -427,6 +428,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
|
||||
(h).hdr_jobid = ntohl((h).hdr_jobid); \
|
||||
(h).hdr_vpid = ntohl((h).hdr_vpid); \
|
||||
(h).hdr_epoch = ntohl((h).hdr_epoch); \
|
||||
} while (0)
|
||||
|
||||
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
|
||||
@ -435,6 +437,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
|
||||
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
|
||||
(h).hdr_jobid = htonl((h).hdr_jobid); \
|
||||
(h).hdr_vpid = htonl((h).hdr_vpid); \
|
||||
(h).hdr_epoch = htonl((h).hdr_epoch); \
|
||||
} while (0)
|
||||
|
||||
#endif /* PML_BFO */
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
|
||||
@ -108,6 +108,7 @@ int ompi_proc_init(void)
|
||||
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
proc->proc_name.vpid = i;
|
||||
proc->proc_name.epoch = ORTE_EPOCH_MIN;
|
||||
if (i == ORTE_PROC_MY_NAME->vpid) {
|
||||
ompi_proc_local_proc = proc;
|
||||
proc->proc_flags = OPAL_PROC_ALL_LOCAL;
|
||||
@ -361,6 +362,8 @@ int ompi_proc_refresh(void) {
|
||||
|
||||
/* Does not change: proc->proc_name.vpid */
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
|
||||
|
||||
/* Make sure to clear the local flag before we set it below */
|
||||
proc->proc_flags = 0;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -67,6 +67,7 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/info/info.h"
|
||||
#include "ompi/errhandler/errcode.h"
|
||||
#include "ompi/errhandler/errhandler.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "ompi/mca/op/op.h"
|
||||
@ -369,6 +370,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
gettimeofday(&ompistart, NULL);
|
||||
}
|
||||
|
||||
/* Register errhandler callback with orte errmgr */
|
||||
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
|
||||
|
||||
/* Figure out the final MPI thread levels. If we were not
|
||||
compiled for support for MPI threads, then don't allow
|
||||
MPI_THREAD_MULTIPLE. Set this stuff up here early in the
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -81,28 +81,36 @@ typedef uint32_t orte_vpid_t;
|
||||
#define ORTE_VPID_T OPAL_UINT32
|
||||
#define ORTE_VPID_MAX UINT32_MAX-2
|
||||
#define ORTE_VPID_MIN 0
|
||||
typedef uint32_t orte_epoch_t;
|
||||
#define ORTE_EPOCH_T OPAL_UINT32
|
||||
#define ORTE_EPOCH_MAX UINT32_MAX-2
|
||||
#define ORTE_EPOCH_MIN 0
|
||||
|
||||
/*
 * Convert the jobid, vpid and epoch fields of a process name to network
 * byte order, in place, by applying htonl() to each field.
 *
 * n must be an lvalue of a structure type with those three fields. It is
 * parenthesized so that expressions such as *ptr expand correctly; note
 * that it is evaluated several times, so it must not have side effects.
 */
#define ORTE_PROCESS_NAME_HTON(n)       \
do {                                    \
    (n).jobid = htonl((n).jobid);       \
    (n).vpid  = htonl((n).vpid);        \
    (n).epoch = htonl((n).epoch);       \
} while (0)
|
||||
|
||||
/*
 * Convert the jobid, vpid and epoch fields of a process name to host
 * byte order, in place, by applying ntohl() to each field.
 *
 * n must be an lvalue of a structure type with those three fields. It is
 * parenthesized so that expressions such as *ptr expand correctly; note
 * that it is evaluated several times, so it must not have side effects.
 */
#define ORTE_PROCESS_NAME_NTOH(n)       \
do {                                    \
    (n).jobid = ntohl((n).jobid);       \
    (n).vpid  = ntohl((n).vpid);        \
    (n).epoch = ntohl((n).epoch);       \
} while (0)
|
||||
|
||||
/*
 * Expand to three comma-separated unsigned long expressions -- the jobid,
 * vpid and epoch of the process name that n points to -- for use in an
 * argument list. When n is NULL the ORTE_JOBID_INVALID, ORTE_VPID_INVALID
 * and ORTE_EPOCH_INVALID values are produced instead.
 *
 * n is evaluated more than once, so it must not have side effects.
 */
#define ORTE_NAME_ARGS(n) \
    (unsigned long) ((NULL == (n)) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
    (unsigned long) ((NULL == (n)) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid), \
    (unsigned long) ((NULL == (n)) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch)
|
||||
|
||||
/*
|
||||
* define invalid values
|
||||
*/
|
||||
#define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2)
|
||||
#define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2)
|
||||
#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2)
|
||||
#define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF)
|
||||
|
||||
/*
|
||||
@ -110,6 +118,7 @@ do { \
|
||||
*/
|
||||
#define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1)
|
||||
#define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1)
|
||||
#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1)
|
||||
#define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF)
|
||||
|
||||
/*
|
||||
@ -118,6 +127,14 @@ do { \
|
||||
struct orte_process_name_t {
|
||||
orte_jobid_t jobid; /**< Job number */
|
||||
orte_vpid_t vpid; /**< Process id - equivalent to rank */
|
||||
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
|
||||
* The epoch will start at ORTE_EPOCH_MIN and
|
||||
* increment every time the process is detected as
|
||||
* having stopped (including normal shutdown). The
|
||||
* HNP will be responsible for informing all
|
||||
* processes that did not directly detect the
|
||||
* failure to increment their epochs.
|
||||
*/
|
||||
};
|
||||
typedef struct orte_process_name_t orte_process_name_t;
|
||||
|
||||
@ -140,35 +157,35 @@ typedef void* orte_iov_base_ptr_t;
|
||||
#define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an orte_process_name_t */
|
||||
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
|
||||
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
|
||||
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
/* State-related types */
|
||||
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 5) /**< node status flag */
|
||||
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< process/resource status */
|
||||
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< job status flag */
|
||||
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 8) /**< process exit code */
|
||||
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */
|
||||
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< process/resource status */
|
||||
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 8) /**< job status flag */
|
||||
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 9) /**< process exit code */
|
||||
/* Data-passing types */
|
||||
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 9) /**< registry return value */
|
||||
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 10) /**< registry return value */
|
||||
/* Resource types */
|
||||
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 10) /**< argv and enviro arrays */
|
||||
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 11) /**< describes capabilities of nodes */
|
||||
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes slot allocations/reservations */
|
||||
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 13) /**< job information */
|
||||
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 14) /**< node information */
|
||||
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 15) /**< process information */
|
||||
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 16) /**< map of process locations */
|
||||
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 11) /**< argv and enviro arrays */
|
||||
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes capabilities of nodes */
|
||||
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 13) /**< describes slot allocations/reservations */
|
||||
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 14) /**< job information */
|
||||
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 15) /**< node information */
|
||||
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 16) /**< process information */
|
||||
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 17) /**< map of process locations */
|
||||
|
||||
/* RML types */
|
||||
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 17) /**< tag for sending/receiving messages */
|
||||
|
||||
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 18) /**< tag for sending/receiving messages */
|
||||
/* DAEMON command type */
|
||||
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 18) /**< command flag for communicating with the daemon */
|
||||
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */
|
||||
|
||||
/* GRPCOMM types */
|
||||
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 19)
|
||||
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20)
|
||||
|
||||
/* IOF types */
|
||||
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20)
|
||||
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21)
|
||||
|
||||
|
||||
/* provide a boundary for others to use */
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -383,6 +386,7 @@ static void recv_cmd(int status,
|
||||
dat = OBJ_NEW(orte_db_data_t);
|
||||
dat->name.jobid = sender->jobid;
|
||||
dat->name.vpid = sender->vpid;
|
||||
dat->name.epoch= sender->epoch;
|
||||
dat->key = key;
|
||||
count=1;
|
||||
opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32);
|
||||
|
@ -1,9 +1,13 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -22,11 +26,15 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
@ -48,9 +56,22 @@ static int update_state(orte_jobid_t job,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
static int post_startup(void);
|
||||
static int pre_shutdown(void);
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata);
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data);
|
||||
|
||||
/******************
|
||||
* HNP module
|
||||
******************/
|
||||
@ -64,7 +85,12 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
post_startup,
|
||||
pre_shutdown,
|
||||
NULL,
|
||||
orte_errmgr_base_set_fault_callback,
|
||||
NULL
|
||||
};
|
||||
|
||||
/************************
|
||||
@ -87,6 +113,8 @@ static int update_state(orte_jobid_t job,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
@ -104,9 +132,9 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
/* if it is our own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->vpid &&
|
||||
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -120,6 +148,95 @@ static int update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int post_startup(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE,
|
||||
ORTE_RML_PERSISTENT,
|
||||
epoch_change_recv,
|
||||
NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int pre_shutdown(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata) {
|
||||
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
|
||||
}
|
||||
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data) {
|
||||
orte_message_event_t *mev = (orte_message_event_t *) data;
|
||||
opal_buffer_t *buffer = mev->buffer;
|
||||
orte_process_name_t *proc;
|
||||
int n = 1, ret, num_dead, i;
|
||||
opal_pointer_array_t *procs;
|
||||
|
||||
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Received epoch change notification",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
procs = OBJ_NEW(opal_pointer_array_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
|
||||
for (i = 0; i < num_dead; i++) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
proc[i].epoch++;
|
||||
orte_util_set_epoch(&proc[i], proc[i].epoch);
|
||||
|
||||
opal_pointer_array_add(procs, &proc[i]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Epoch for %s updated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc[i])));
|
||||
}
|
||||
|
||||
if (NULL != fault_cbfunc && 0 < num_dead) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
(*fault_cbfunc)(procs);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback failed!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
free(proc);
|
||||
OBJ_RELEASE(procs);
|
||||
}
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
@ -161,7 +278,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
||||
return exit_status;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -105,6 +105,7 @@ ORTE_DECLSPEC void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, o
|
||||
/*
|
||||
* Additional External API function declared in errmgr.h
|
||||
*/
|
||||
ORTE_DECLSPEC orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -97,11 +97,13 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
item->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
}
|
||||
|
||||
/*
 * Destructor for orte_errmgr_predicted_proc_t: resets every field of the
 * stored process name (jobid, vpid, epoch) to its INVALID marker.
 */
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
{
    item->proc_name.jobid = ORTE_JOBID_INVALID;
    item->proc_name.vpid  = ORTE_VPID_INVALID;
    item->proc_name.epoch = ORTE_EPOCH_INVALID;
}
|
||||
|
||||
@ -137,11 +139,13 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
|
||||
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
item->node_name = NULL;
|
||||
|
||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
item->map_node_name = NULL;
|
||||
@ -152,6 +156,7 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
||||
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
if( NULL != item->node_name ) {
|
||||
@ -160,6 +165,7 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
||||
}
|
||||
|
||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
|
||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
if( NULL != item->map_node_name ) {
|
||||
@ -678,6 +684,18 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Install a new fault callback.
 *
 * Stores cbfunc in the global fault_cbfunc and returns the callback that
 * was installed before the call, so the caller can restore or chain it.
 */
orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc)
{
    orte_errmgr_fault_callback_t *previous;

    /* remember the old callback before it is overwritten */
    previous = fault_cbfunc;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s errmgr:base Called set_fault_callback",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    fault_cbfunc = cbfunc;
    return previous;
}
|
||||
|
||||
/********************
|
||||
* Local Functions
|
||||
********************/
|
||||
|
@ -55,6 +55,8 @@ orte_errmgr_base_t orte_errmgr_base;
|
||||
|
||||
orte_errmgr_base_component_t orte_errmgr_base_selected_component;
|
||||
|
||||
orte_errmgr_fault_callback_t *fault_cbfunc;
|
||||
|
||||
/* Public module provides a wrapper around previous functions */
|
||||
orte_errmgr_base_module_t orte_errmgr = {
|
||||
NULL, /* init */
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -264,6 +267,7 @@ static int errmgr_base_tool_start_cmdline_listener(void)
|
||||
*/
|
||||
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
|
||||
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
|
||||
errmgr_cmdline_sender.epoch = ORTE_EPOCH_INVALID;
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_MIGRATE,
|
||||
0,
|
||||
@ -375,12 +379,14 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
|
||||
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
||||
swap_dest.jobid = errmgr_cmdline_sender.jobid;
|
||||
swap_dest.vpid = errmgr_cmdline_sender.vpid;
|
||||
swap_dest.epoch = errmgr_cmdline_sender.epoch;
|
||||
|
||||
errmgr_cmdline_sender = *sender;
|
||||
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
|
||||
|
||||
errmgr_cmdline_sender.jobid = swap_dest.jobid;
|
||||
errmgr_cmdline_sender.vpid = swap_dest.vpid;
|
||||
errmgr_cmdline_sender.epoch = swap_dest.epoch;
|
||||
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -49,6 +49,7 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
@ -90,6 +91,22 @@ struct orte_errmgr_predicted_node_t {
|
||||
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
|
||||
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
|
||||
|
||||
/*
|
||||
* Callback function that should be called when there is a fault.
|
||||
*
|
||||
* This callback function will be used anytime (other than during finalize) the
|
||||
* runtime detects and handles a process failure. The runtime will complete all
|
||||
* its stabilization before alerting the callback function. The parameter to the
|
||||
* callback function will be a pointer array of the orte_process_name_t of each process that failed.
|
||||
* It will not alert the application to failures that are not in the same job as
|
||||
* the alerted process, only failures within the same jobid.
|
||||
*
|
||||
* @param[in] procs The names of the processes that failed
|
||||
*/
|
||||
typedef void (orte_errmgr_fault_callback_t)(opal_pointer_array_t *procs);
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
|
||||
|
||||
/*
|
||||
* Structure to describe a suggested remapping element for a predicted fault.
|
||||
*
|
||||
@ -242,42 +259,100 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
|
||||
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
||||
|
||||
/**
|
||||
* Register a callback to alert caller when ORTE is preparing to
|
||||
* migrate the process to another location. This provides an
|
||||
* opportunity for the process to checkpoint any required state,
|
||||
* and to cleanly shutdown.
|
||||
* Function to perform actions that require the rest of the ORTE layer to be up
|
||||
* and running.
|
||||
*
|
||||
* @param[in] delay Time to delay before assuming process is stuck
|
||||
* and cannot exit on its own - and thus, go
|
||||
* ahead and migrate it
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR An unspecified error occurred
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
|
||||
|
||||
/*
|
||||
* This function gets called just after startup is finished. It gives the errmgr
|
||||
* a chance to setup anything that requires ORTE to actually be ready to go such
|
||||
* as registering callbacks, posting receives, etc.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_post_startup_t)(void);
|
||||
|
||||
/*
|
||||
* This function gets called just before shutdown begins. It gives the errmgr a
|
||||
* chance to clean up anything that it did after startup, i.e. deregistering
|
||||
* callbacks, cleaning up receives, etc.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_pre_shutdown_t)(void);
|
||||
|
||||
/**
|
||||
* Function to mark a list of processes as dead and perform any internal cleanup
|
||||
* necessary.
|
||||
*
|
||||
* @param[in] dead_procs Process list that is being marked as dead.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully.
|
||||
* @retval ORTE_ERROR An unspecified error occurred.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_mark_processes_as_dead_t)(opal_pointer_array_t *dead_procs);
|
||||
|
||||
/**
|
||||
* Set the callback function for faults.
|
||||
*
|
||||
* @param[in] cbfunc The callback function.
|
||||
*
|
||||
* @retval The previous fault callback function.
|
||||
*/
|
||||
typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
|
||||
|
||||
/**
|
||||
* Receive updates about failure notifications.
|
||||
*
|
||||
* @param[in] sender The process who originally sent the failure notification.
|
||||
* @param[in] buffer The buffer containing all the information about the failed process.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully.
|
||||
* @retval ORTE_ERROR An unspecified error occurred.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_failure_notification_t)(orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer);
|
||||
|
||||
/*
|
||||
* Module Structure
|
||||
*/
|
||||
struct orte_errmgr_base_module_2_3_0_t {
|
||||
/** Initialization Function */
|
||||
orte_errmgr_base_module_init_fn_t init;
|
||||
orte_errmgr_base_module_init_fn_t init;
|
||||
/** Finalization Function */
|
||||
orte_errmgr_base_module_finalize_fn_t finalize;
|
||||
orte_errmgr_base_module_finalize_fn_t finalize;
|
||||
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||
|
||||
/** Actual process failure notification */
|
||||
orte_errmgr_base_module_update_state_fn_t update_state;
|
||||
orte_errmgr_base_module_update_state_fn_t update_state;
|
||||
/** Predicted process/node failure notification */
|
||||
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
||||
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
|
||||
/** Suggest a node to map a restarting process onto */
|
||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
||||
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
|
||||
|
||||
/** Handle any FT Notifications */
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
|
||||
/* Register to be warned of impending migration */
|
||||
/* Register to be warned of impending migration */
|
||||
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
|
||||
|
||||
/** Perform post-startup operations */
|
||||
orte_errmgr_base_module_post_startup_t post_startup;
|
||||
|
||||
/** Perform pre-shutdown operations */
|
||||
orte_errmgr_base_module_pre_shutdown_t pre_shutdown;
|
||||
|
||||
/* Mark a process as dead. */
|
||||
orte_errmgr_base_module_mark_processes_as_dead_t mark_processes_as_dead;
|
||||
|
||||
/* Set the callback function */
|
||||
orte_errmgr_base_module_set_fault_callback_t set_fault_callback;
|
||||
|
||||
/* Receive failure notification */
|
||||
orte_errmgr_base_module_failure_notification_t failure_notification;
|
||||
};
|
||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -57,10 +60,6 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
|
||||
orte_vpid_t vpid,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/***************************
|
||||
* Module functions: Global
|
||||
@ -81,6 +80,10 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnp_global_ft_event(int state);
|
||||
int orte_errmgr_hnp_global_post_startup(void);
|
||||
int orte_errmgr_hnp_global_pre_shutdown(void);
|
||||
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
|
||||
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
|
||||
|
||||
/* HNP Versions */
|
||||
int orte_errmgr_hnp_base_global_init(void);
|
||||
|
@ -2,6 +2,9 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -391,6 +394,7 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *node = NULL;
|
||||
bool found = false;
|
||||
int num_removed = 0, num_to_remove;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
if( NULL == current_global_jobdata ) {
|
||||
return ORTE_SUCCESS;
|
||||
@ -410,8 +414,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
item = opal_list_get_next(item) ) {
|
||||
wp_item = (errmgr_autor_wp_item_t*)item;
|
||||
|
||||
if( wp_item->name.vpid == proc->name.vpid &&
|
||||
wp_item->name.jobid == proc->name.jobid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
@ -518,6 +522,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
|
||||
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
|
||||
wp_item->name.jobid = proc->jobid;
|
||||
wp_item->name.vpid = proc->vpid;
|
||||
wp_item->name.epoch = proc->epoch;
|
||||
wp_item->state = state;
|
||||
|
||||
opal_list_append(procs_pending_recovery, &(wp_item->super));
|
||||
@ -621,6 +626,7 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
|
||||
{
|
||||
wp->name.jobid = ORTE_JOBID_INVALID;
|
||||
wp->name.vpid = ORTE_VPID_INVALID;
|
||||
wp->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
wp->state = 0;
|
||||
}
|
||||
@ -629,6 +635,7 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
|
||||
{
|
||||
wp->name.jobid = ORTE_JOBID_INVALID;
|
||||
wp->name.vpid = ORTE_VPID_INVALID;
|
||||
wp->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
wp->state = 0;
|
||||
}
|
||||
|
@ -2,6 +2,9 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -747,6 +750,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
close_iof_stdin = true;
|
||||
iof_name.jobid = proc->name.jobid;
|
||||
iof_name.vpid = proc->name.vpid;
|
||||
iof_name.epoch = proc->name.epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -803,6 +807,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
close_iof_stdin = true;
|
||||
iof_name.jobid = proc->name.jobid;
|
||||
iof_name.vpid = proc->name.vpid;
|
||||
iof_name.epoch = proc->name.epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -850,6 +855,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
close_iof_stdin = true;
|
||||
iof_name.jobid = proc->name.jobid;
|
||||
iof_name.vpid = proc->name.vpid;
|
||||
iof_name.epoch = proc->name.epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,9 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,9 +32,11 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
@ -53,8 +58,9 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
|
||||
static void update_local_children(orte_odls_job_t *jobdat,
|
||||
orte_job_state_t jobstate,
|
||||
orte_proc_state_t state);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
|
||||
static int record_dead_process(orte_process_name_t *proc);
|
||||
static int send_to_local_applications(opal_pointer_array_t *dead_names);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
@ -79,7 +85,11 @@ static int suggest_map_targets(orte_proc_t *proc,
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
static int post_startup(void);
|
||||
static int pre_shutdown(void);
|
||||
|
||||
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
|
||||
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
|
||||
|
||||
/******************
|
||||
* ORTED module
|
||||
@ -94,7 +104,12 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
post_startup,
|
||||
pre_shutdown,
|
||||
mark_processes_as_dead,
|
||||
orte_errmgr_base_set_fault_callback, /* Set callback function */
|
||||
failure_notification
|
||||
};
|
||||
|
||||
/************************
|
||||
@ -125,20 +140,29 @@ static int update_state(orte_jobid_t job,
|
||||
int rc=ORTE_SUCCESS;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
orte_app_context_t *app;
|
||||
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:orted:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
((NULL == proc) ? "App. Process" :
|
||||
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
|
||||
|
||||
/* if this is a heartbeat failure, let the HNP handle it */
|
||||
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
|
||||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*** UPDATE COMMAND FOR A JOB ***/
|
||||
if (NULL == proc) {
|
||||
/* this is an update for an entire job */
|
||||
@ -175,7 +199,7 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jobdat->jobid == job) {
|
||||
break;
|
||||
@ -184,7 +208,7 @@ static int update_state(orte_jobid_t job,
|
||||
if (NULL == jobdat) {
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
|
||||
switch (jobstate) {
|
||||
case ORTE_JOB_STATE_FAILED_TO_START:
|
||||
failed_start(jobdat, exit_code);
|
||||
@ -197,10 +221,10 @@ static int update_state(orte_jobid_t job,
|
||||
/* update all procs in job */
|
||||
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
|
||||
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
case ORTE_JOB_STATE_COMM_FAILED:
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
/* tell the caller we can't recover */
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
break;
|
||||
@ -237,15 +261,16 @@ static int update_state(orte_jobid_t job,
|
||||
* lifeline
|
||||
*/
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* if it is our own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
|
||||
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* see if this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
/* kill our children */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
/* terminate - our routed children will see
|
||||
* us leave and automatically die
|
||||
*/
|
||||
@ -256,21 +281,25 @@ static int update_state(orte_jobid_t job,
|
||||
/* was it a daemon that failed? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* if all my routes are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
if (0 == orte_routed.num_routes() &&
|
||||
0 == opal_list_get_size(&orte_local_children)) {
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
|
||||
record_dead_process(proc);
|
||||
|
||||
/* if not, then indicate we can continue */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* lookup the local jobdat for this job */
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jobdat->jobid == proc->jobid) {
|
||||
break;
|
||||
@ -280,7 +309,7 @@ static int update_state(orte_jobid_t job,
|
||||
/* must already be complete */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* if there are no local procs for this job, we can
|
||||
* ignore this call
|
||||
*/
|
||||
@ -301,15 +330,15 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
|
||||
child->state = state;
|
||||
child->exit_code = exit_code;
|
||||
/* Decrement the number of local procs */
|
||||
jobdat->num_local_procs--;
|
||||
/* kill this proc */
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
killprocs(proc->jobid, proc->vpid, proc->epoch);
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
|
||||
@ -324,7 +353,7 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (ORTE_PROC_STATE_TERMINATED < state) {
|
||||
if( jobdat->enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
@ -335,8 +364,8 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
app = jobdat->apps[child->app_idx];
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
@ -363,8 +392,8 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
REPORT_ABORT:
|
||||
|
||||
REPORT_ABORT:
|
||||
/* if the job hasn't completed and the state is abnormally
|
||||
* terminated, then we need to alert the HNP right away
|
||||
*/
|
||||
@ -387,8 +416,8 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
|
||||
child->state = state;
|
||||
child->exit_code = exit_code;
|
||||
@ -402,7 +431,7 @@ static int update_state(orte_jobid_t job,
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
/* Decrement the number of local procs */
|
||||
jobdat->num_local_procs--;
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -424,15 +453,15 @@ static int update_state(orte_jobid_t job,
|
||||
OBJ_DESTRUCT(&alert);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* find this proc in the local children so we can update its state */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
|
||||
child->state = state;
|
||||
if (0 < pid) {
|
||||
@ -452,7 +481,7 @@ static int update_state(orte_jobid_t job,
|
||||
* the HNP so it is available to debuggers and anyone
|
||||
* else that needs it
|
||||
*/
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted: sending contact info to HNP",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -469,7 +498,7 @@ static int update_state(orte_jobid_t job,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto FINAL_CLEANUP;
|
||||
}
|
||||
/* pack all the local child vpids */
|
||||
/* pack all the local child vpids and epochs */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -479,6 +508,10 @@ static int update_state(orte_jobid_t job,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto FINAL_CLEANUP;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->epoch, 1, ORTE_EPOCH))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto FINAL_CLEANUP;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* pack an invalid marker */
|
||||
@ -502,7 +535,7 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/* only other state is terminated - see if anyone is left alive */
|
||||
if (!any_live_children(proc->jobid)) {
|
||||
/* lookup the local jobdat for this job */
|
||||
@ -511,7 +544,7 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jobdat->jobid == proc->jobid) {
|
||||
break;
|
||||
@ -533,8 +566,8 @@ static int update_state(orte_jobid_t job,
|
||||
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
FINAL_CLEANUP:
|
||||
|
||||
FINAL_CLEANUP:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted reporting all procs in %s terminated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -548,7 +581,7 @@ static int update_state(orte_jobid_t job,
|
||||
item = next) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
next = opal_list_get_next(item);
|
||||
|
||||
|
||||
if (jobdat->jobid == child->name->jobid) {
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
OBJ_RELEASE(child);
|
||||
@ -557,11 +590,11 @@ static int update_state(orte_jobid_t job,
|
||||
|
||||
/* ensure the job's local session directory tree is removed */
|
||||
orte_session_dir_cleanup(jobdat->jobid);
|
||||
|
||||
|
||||
/* remove this job from our local job data since it is complete */
|
||||
opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
|
||||
OBJ_RELEASE(jobdat);
|
||||
|
||||
|
||||
/* send it */
|
||||
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -569,6 +602,7 @@ static int update_state(orte_jobid_t job,
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
OBJ_DESTRUCT(&alert);
|
||||
|
||||
/* indicate that the job is complete */
|
||||
return rc;
|
||||
}
|
||||
@ -594,6 +628,131 @@ int ft_event(int state)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int post_startup(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int pre_shutdown(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Record a set of processes as dead.
 *
 * For every non-NULL name in dead_procs whose epoch is not older than the
 * epoch returned by orte_util_lookup_epoch(), this routine:
 *   - sets the process state to ORTE_PROC_STATE_TERMINATED,
 *   - stores an epoch one higher than the one carried by the name,
 *   - unlinks the matching entry (same jobid and vpid) from
 *     orte_local_children, if there is one,
 *   - deletes the route to the process.
 * Afterwards the routing tree for this daemon's job is refreshed and the
 * registered fault callback, if any, is invoked with dead_procs.
 *
 * @param dead_procs  pointer array of orte_process_name_t*; NULL slots are
 *                    reported and skipped.
 * @return ORTE_SUCCESS always.
 */
int mark_processes_as_dead(opal_pointer_array_t *dead_procs)
{
    int idx;
    orte_process_name_t *dead;
    opal_list_item_t *cursor;
    orte_odls_child_t *local_child;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "ORTED %s marking procs as dead",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (idx = 0; idx < opal_pointer_array_get_size(dead_procs); idx++) {
        dead = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, idx);
        if (NULL == dead) {
            opal_output(0, "NULL found in dead process list.");
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "ORTED %s marking %s as dead",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(dead)));

        /* A name carrying an epoch older than the one we already know
         * about refers to a previous incarnation - nothing to do. */
        if (dead->epoch < orte_util_lookup_epoch(dead)) {
            continue;
        }

        /* Flag the process as terminated, then move it to the next epoch */
        orte_util_set_proc_state(dead, ORTE_PROC_STATE_TERMINATED);
        orte_util_set_epoch(dead, dead->epoch + 1);

        /* Drop the dead process from my list of children if applicable.
         * NOTE(review): the unlinked child is not released here - confirm
         * that another owner still holds it. */
        cursor = opal_list_get_first(&orte_local_children);
        while (cursor != opal_list_get_end(&orte_local_children)) {
            local_child = (orte_odls_child_t *) cursor;

            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
                                                            local_child->name, dead)) {
                opal_list_remove_item(&orte_local_children, cursor);
                break;
            }

            cursor = opal_list_get_next(cursor);
        }

        /* The routing layer must forget about this process as well */
        orte_routed.delete_route(dead);
    }

    /* Let the routing module rebuild its tree */
    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);

    /* Hand the list to whoever registered for fault notifications */
    if (NULL != fault_cbfunc) {
        (*fault_cbfunc)(dead_procs);
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
/**
 * Handle a failure notification message.
 *
 * The buffer is expected to hold a count followed by that many process
 * names. Each name whose epoch is not older than the locally known epoch
 * is collected; the collected names are then passed to
 * orte_errmgr.mark_processes_as_dead() and forwarded to the local
 * applications via send_to_local_applications().
 *
 * All names allocated here, and the array holding them, are released
 * before returning - on success and on every error path.
 *
 * @param sender  name of the process that sent the notification (only used
 *                for debug output).
 * @param buffer  message to unpack.
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE if an allocation fails,
 *         or the error returned by unpacking / local delivery.
 */
int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer)
{
    opal_pointer_array_t *dead_names;
    orte_process_name_t *name_item;
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS, num_failed;
    int32_t i;

    dead_names = OBJ_NEW(opal_pointer_array_t);
    if (NULL == dead_names) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* Get the number of failed procs */
    n = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < num_failed; i++) {
        name_item = (orte_process_name_t *) malloc(sizeof(*name_item));
        if (NULL == name_item) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /* Unpack the buffer to get the dead process' name. */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            free(name_item);   /* not yet owned by dead_names */
            goto cleanup;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(name_item),
                        ORTE_NAME_PRINT(sender));
        }

        /* There shouldn't be an issue of receiving this message multiple
         * times but it doesn't hurt to double check. The epoch to test is
         * the one carried by the name we just unpacked.
         */
        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
            free(name_item);   /* stale report: drop it */
            continue;
        }

        opal_pointer_array_add(dead_names, name_item);
    }

    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
    orte_errmgr.mark_processes_as_dead(dead_names);

    /* Tell the applications' ORTE layers that there is a failure. */
    ret = send_to_local_applications(dead_names);

cleanup:
    /* Release every name we stored; empty slots yield NULL, which free()
     * accepts. Stale names were skipped above, so walk the array itself
     * rather than counting up to num_failed. */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
        free(name_item);
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
@ -601,14 +760,14 @@ static bool any_live_children(orte_jobid_t job)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
|
||||
|
||||
/* the thread is locked elsewhere - don't try to do it again here */
|
||||
|
||||
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
|
||||
/* is this child part of the specified job? */
|
||||
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
|
||||
child->alive) {
|
||||
@ -618,13 +777,13 @@ static bool any_live_children(orte_jobid_t job)
|
||||
|
||||
/* if we get here, then nobody is left alive from that job */
|
||||
return false;
|
||||
|
||||
|
||||
}
|
||||
|
||||
static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
|
||||
{
|
||||
int rc;
|
||||
|
||||
|
||||
/* pack the child's vpid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -659,70 +818,70 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
|
||||
{
|
||||
int rc;
|
||||
opal_list_item_t *item, *next;
|
||||
orte_odls_child_t *child;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
|
||||
/* pack the jobid */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* if we are timing things, pack the time the launch msg for this job was recvd */
|
||||
if (orte_timing) {
|
||||
int64_t tmp;
|
||||
tmp = jobdat->launch_msg_recvd.tv_sec;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
tmp = jobdat->launch_msg_recvd.tv_usec;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = next) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
next = opal_list_get_next(item);
|
||||
/* if this child is part of the job... */
|
||||
if (child->name->jobid == jobdat->jobid) {
|
||||
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* flag that this job is complete so the receiver can know */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
/**
 * Pack a state update for one job into an alert buffer.
 *
 * Layout written to the buffer:
 *   1. the jobid,
 *   2. when orte_timing is set, the seconds and microseconds at which the
 *      launch message for the job was received (two OPAL_INT64 values),
 *   3. the state of every local child belonging to the job, as produced by
 *      pack_state_for_proc(),
 *   4. an ORTE_VPID_INVALID marker so the receiver knows the job's data is
 *      complete.
 *
 * @param alert   buffer to append to.
 * @param jobdat  local data of the job being reported.
 * @return ORTE_SUCCESS, or the first packing error encountered (which is
 *         also logged).
 */
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
    orte_vpid_t end_marker = ORTE_VPID_INVALID;
    opal_list_item_t *cursor, *following;
    orte_odls_child_t *kid;
    int status;

    /* the jobid always comes first */
    status = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* when timing is enabled, add the time the launch msg for this job was recvd */
    if (orte_timing) {
        int64_t stamp;

        stamp = jobdat->launch_msg_recvd.tv_sec;
        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }

        stamp = jobdat->launch_msg_recvd.tv_usec;
        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }
    }

    /* append the state of each local child that belongs to this job */
    cursor = opal_list_get_first(&orte_local_children);
    while (cursor != opal_list_get_end(&orte_local_children)) {
        kid = (orte_odls_child_t*)cursor;
        following = opal_list_get_next(cursor);

        if (kid->name->jobid == jobdat->jobid) {
            status = pack_state_for_proc(alert, kid);
            if (ORTE_SUCCESS != status) {
                ORTE_ERROR_LOG(status);
                return status;
            }
        }

        cursor = following;
    }

    /* terminate the record so the receiver can tell the job is complete */
    status = opal_dss.pack(alert, &end_marker, 1, ORTE_VPID);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
static bool all_children_registered(orte_jobid_t job)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
|
||||
|
||||
/* the thread is locked elsewhere - don't try to do it again here */
|
||||
|
||||
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
|
||||
/* is this child part of the specified job? */
|
||||
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
|
||||
/* if this child has terminated, we consider it as having
|
||||
@ -748,10 +907,10 @@ static bool all_children_registered(orte_jobid_t job)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* if we get here, then everyone in the job is currently registered */
|
||||
return true;
|
||||
|
||||
|
||||
}
|
||||
|
||||
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
@ -759,14 +918,14 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
int rc;
|
||||
|
||||
|
||||
/* the thread is locked elsewhere - don't try to do it again here */
|
||||
|
||||
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
|
||||
/* is this child part of the specified job? */
|
||||
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
|
||||
/* pack the child's vpid - must be done in case rml_uri is NULL */
|
||||
@ -774,6 +933,11 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* Pack the child's epoch. */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* pack the contact info */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -781,19 +945,19 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
|
||||
|
||||
/* set the state */
|
||||
jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
|
||||
|
||||
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -822,7 +986,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
orte_odls_child_t *child;
|
||||
|
||||
|
||||
/* update job state */
|
||||
jobdat->state = jobstate;
|
||||
/* update children */
|
||||
@ -836,28 +1000,29 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
|
||||
}
|
||||
}
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
|
||||
{
|
||||
opal_pointer_array_t cmd;
|
||||
orte_proc_t proc;
|
||||
int rc;
|
||||
|
||||
|
||||
/* stop local sensors for this job */
|
||||
if (ORTE_VPID_WILDCARD == vpid) {
|
||||
orte_sensor.stop(job);
|
||||
}
|
||||
|
||||
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
|
||||
|
||||
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
|
||||
OBJ_CONSTRUCT(&proc, orte_proc_t);
|
||||
proc.name.jobid = job;
|
||||
proc.name.vpid = vpid;
|
||||
proc.name.epoch = epoch;
|
||||
opal_pointer_array_add(&cmd, &proc);
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -865,3 +1030,85 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_DESTRUCT(&proc);
|
||||
}
|
||||
|
||||
/**
 * Record the death of a single process and report it to the HNP.
 *
 * Nothing is done when orte_odls_base_default_check_finished() returns
 * true for the process. Otherwise the process is marked dead locally via
 * mark_processes_as_dead() and a message consisting of the
 * ORTE_PROCESS_FAILED_NOTIFICATION command, a count of one, and the
 * process name is sent to the HNP on ORTE_RML_TAG_DAEMON.
 *
 * The message is only sent when it was packed completely, so the HNP never
 * receives a truncated notification; a failed send is logged and returned.
 *
 * @param proc  name of the process that died.
 * @return ORTE_SUCCESS, or the packing / send error (already logged).
 */
static int record_dead_process(orte_process_name_t *proc)
{
    opal_pointer_array_t *dead_name;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command = ORTE_PROCESS_FAILED_NOTIFICATION;
    int rc = ORTE_SUCCESS;
    int num_failed = 1;

    if (orte_odls_base_default_check_finished(proc)) {
        return rc;
    }

    /* Mark the process as dead; the array only borrows the caller's name */
    dead_name = OBJ_NEW(opal_pointer_array_t);
    opal_pointer_array_add(dead_name, proc);
    mark_processes_as_dead(dead_name);
    OBJ_RELEASE(dead_name);

    /* Build the message for the HNP and send it only if it is complete */
    buffer = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
    } else if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
    } else {
        /* a non-negative send result is not an error code */
        rc = ORTE_SUCCESS;
    }

    OBJ_RELEASE(buffer);

    return rc;
}
|
||||
|
||||
/**
 * Forward a list of dead process names to the local applications.
 *
 * The message holds the number of names followed by the names themselves
 * and is delivered to all local jobs on ORTE_RML_TAG_EPOCH_CHANGE.
 *
 * The count that is packed is the number of non-NULL entries, i.e. exactly
 * the number of names that follow it. The pointer array's size is a slot
 * count and slots may be NULL (they are skipped below), so packing the raw
 * size could announce more names than the message contains.
 *
 * @param dead_names  pointer array of orte_process_name_t*.
 * @return ORTE_SUCCESS, or the packing / delivery error (already logged).
 */
int send_to_local_applications(opal_pointer_array_t *dead_names)
{
    opal_buffer_t *buf;
    int ret;
    orte_process_name_t *name_item;
    int slots, size, i;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s Sending failure to local applications.",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    buf = OBJ_NEW(opal_buffer_t);

    /* Count the names that will really be packed */
    slots = opal_pointer_array_get_size(dead_names);
    size = 0;
    for (i = 0; i < slots; i++) {
        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
            size++;
        }
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < slots; i++) {
        name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
        if (NULL == name_item) {
            continue;
        }
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    ret = ORTE_SUCCESS;

cleanup:
    /* single exit: the buffer is ours on every path */
    OBJ_RELEASE(buf);

    return ret;
}
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -61,6 +61,7 @@ orte_ess_base_module_t orte_ess_alps_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch,
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -264,10 +265,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
|
||||
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
{
|
||||
orte_pmap_t *pmap;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* is this me? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
/* yes it is - reply with my rank. This is necessary
|
||||
* because the pidmap will not have arrived when I
|
||||
* am starting up, and if we use static ports, then
|
||||
@ -348,6 +351,7 @@ static int alps_set_name(void)
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid;
|
||||
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -363,5 +367,9 @@ static int alps_set_name(void)
|
||||
|
||||
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -86,6 +86,8 @@ ORTE_DECLSPEC int orte_ess_env_put(orte_std_cntr_t num_procs,
|
||||
orte_std_cntr_t num_local_procs,
|
||||
char ***env);
|
||||
|
||||
ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc);
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -46,6 +46,10 @@ int orte_ess_env_get(void)
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
|
||||
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -24,10 +24,8 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
@ -38,10 +36,19 @@
|
||||
|
||||
opal_list_t orte_ess_base_components_available;
|
||||
orte_ess_base_module_t orte_ess = {
|
||||
NULL, /* init */
|
||||
NULL, /* finalize */
|
||||
NULL, /* abort */
|
||||
NULL /* ft_event */
|
||||
NULL, /* init */
|
||||
NULL, /* finalize */
|
||||
NULL, /* abort */
|
||||
NULL, /* proc_get_locality */
|
||||
NULL, /* proc_get_daemon */
|
||||
NULL, /* proc_get_hostname */
|
||||
NULL, /* get_local_rank */
|
||||
NULL, /* get_node_rank */
|
||||
NULL, /* proc_get_epoch */
|
||||
NULL, /* update_pidmap */
|
||||
NULL, /* update_nidmap */
|
||||
NULL, /* query_sys_info */
|
||||
NULL /* ft_event */
|
||||
};
|
||||
int orte_ess_base_output;
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -24,10 +24,33 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_component_repository.h"
|
||||
|
||||
#include "orte/util/nidmap.h"
|
||||
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
|
||||
extern opal_list_t orte_ess_base_components_available;
|
||||
|
||||
/**
|
||||
* Generic function to retrieve the epoch of a specific process
|
||||
* from the job data.
|
||||
*/
|
||||
orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc)
|
||||
{
|
||||
orte_epoch_t epoch;
|
||||
|
||||
if (ORTE_EPOCH_INVALID == (epoch = orte_util_lookup_epoch(proc))) {
|
||||
return ORTE_NODE_RANK_INVALID;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
|
||||
"%s ess:generic: proc %s has epoch %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc),
|
||||
epoch));
|
||||
|
||||
return epoch;
|
||||
}
|
||||
|
||||
int
|
||||
orte_ess_base_select(void)
|
||||
{
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -78,7 +78,7 @@ int orte_ess_base_app_setup(void)
|
||||
error = "orte_errmgr_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
|
||||
/* Setup the communication infrastructure */
|
||||
|
||||
/* Runtime Messaging Layer */
|
||||
@ -92,6 +92,7 @@ int orte_ess_base_app_setup(void)
|
||||
error = "orte_rml_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Routed system */
|
||||
if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
@ -238,6 +239,13 @@ int orte_ess_base_app_setup(void)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Execute the post-startup errmgr code */
|
||||
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_errmgr.post_startup";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* if we are an ORTE app - and not an MPI app - then
|
||||
* we need to barrier here. MPI_Init has its own barrier,
|
||||
* so we don't need to do two of them. However, if we
|
||||
@ -270,6 +278,8 @@ error:
|
||||
|
||||
int orte_ess_base_app_finalize(void)
|
||||
{
|
||||
orte_errmgr.pre_shutdown();
|
||||
|
||||
orte_notifier_base_close();
|
||||
|
||||
orte_cr_finalize();
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -327,7 +327,7 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree!
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree())) {
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "failed to update routing tree";
|
||||
goto error;
|
||||
@ -514,6 +514,13 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
/* start the local sensors */
|
||||
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
/* Execute the post-startup errmgr code */
|
||||
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_errmgr.post_startup";
|
||||
goto error;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
@ -526,6 +533,8 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
|
||||
int orte_ess_base_orted_finalize(void)
|
||||
{
|
||||
orte_errmgr.pre_shutdown();
|
||||
|
||||
/* stop the local sensors */
|
||||
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -58,6 +58,7 @@ orte_ess_base_module_t orte_ess_cnos_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* get_epoch */
|
||||
NULL, /* add_pidmap is only used in ORTE */
|
||||
NULL, /* update_nidmap is only used in ORTE */
|
||||
query_sys_info,
|
||||
@ -87,6 +88,10 @@ static int rte_init(void)
|
||||
|
||||
/* Get the number of procs in the job from cnos */
|
||||
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* Get the nid map */
|
||||
nprocs = cnos_get_nidpid_map(&map);
|
||||
|
12
orte/mca/ess/env/ess_env_module.c
поставляемый
12
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -99,6 +99,7 @@ orte_ess_base_module_t orte_ess_env_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -305,10 +306,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
|
||||
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
{
|
||||
orte_pmap_t *pmap;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
|
||||
/* is this me? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
/* yes it is - reply with my rank. This is necessary
|
||||
* because the pidmap will not have arrived when I
|
||||
* am starting up, and if we use static ports, then
|
||||
@ -386,9 +389,10 @@ static int env_set_name(void)
|
||||
return(rc);
|
||||
}
|
||||
free(tmp);
|
||||
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:env set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2010 High Performance Computing Center Stuttgart,
|
||||
@ -104,6 +104,15 @@ typedef orte_local_rank_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_
|
||||
*/
|
||||
typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc);
|
||||
|
||||
/**
|
||||
* Update the epoch
|
||||
*
|
||||
* The epochs of the processes are stored in the process_name struct, but this
|
||||
* will get the most up to date version stored within the orte_proc_t struct.
|
||||
* Obviously the epoch of the proc that is passed in will be ignored.
|
||||
*/
|
||||
typedef orte_epoch_t (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc);
|
||||
|
||||
/**
|
||||
* Update the pidmap
|
||||
*
|
||||
@ -154,6 +163,7 @@ struct orte_ess_base_module_1_0_0_t {
|
||||
orte_ess_base_module_proc_get_hostname_fn_t proc_get_hostname;
|
||||
orte_ess_base_module_proc_get_local_rank_fn_t get_local_rank;
|
||||
orte_ess_base_module_proc_get_node_rank_fn_t get_node_rank;
|
||||
orte_ess_base_module_proc_get_epoch_fn_t proc_get_epoch;
|
||||
orte_ess_base_module_update_pidmap_fn_t update_pidmap;
|
||||
orte_ess_base_module_update_nidmap_fn_t update_nidmap;
|
||||
orte_ess_base_module_query_sys_info_t query_sys_info;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -95,6 +95,7 @@ orte_ess_base_module_t orte_ess_generic_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch,
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -154,6 +155,7 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
|
||||
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s completed name definition",
|
||||
@ -165,6 +167,10 @@ static int rte_init(void)
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = strtol(envar, NULL, 10);
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* set the app_num so that MPI attributes get set correctly */
|
||||
orte_process_info.app_num = 1;
|
||||
@ -267,6 +273,7 @@ static int rte_init(void)
|
||||
if (vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
ORTE_PROC_MY_DAEMON->jobid = 0;
|
||||
ORTE_PROC_MY_DAEMON->vpid = i;
|
||||
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s node %d name %s rank %s",
|
||||
@ -297,6 +304,7 @@ static int rte_init(void)
|
||||
if (vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
ORTE_PROC_MY_DAEMON->jobid = 0;
|
||||
ORTE_PROC_MY_DAEMON->vpid = i;
|
||||
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s node %d name %s rank %d",
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -75,6 +75,7 @@
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/comm/comm.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
@ -108,6 +109,7 @@ orte_ess_base_module_t orte_ess_hnp_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -490,6 +492,8 @@ static int rte_init(void)
|
||||
proc = OBJ_NEW(orte_proc_t);
|
||||
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
proc->name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
proc->pid = orte_process_info.pid;
|
||||
proc->rml_uri = orte_rml.get_contact_info();
|
||||
proc->state = ORTE_PROC_STATE_RUNNING;
|
||||
@ -820,6 +824,7 @@ static uint8_t proc_get_locality(orte_process_name_t *proc)
|
||||
orte_node_t *node;
|
||||
orte_proc_t *myproc;
|
||||
int i;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* the HNP is always on node=0 of the node array */
|
||||
node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
||||
@ -829,8 +834,10 @@ static uint8_t proc_get_locality(orte_process_name_t *proc)
|
||||
if (NULL == (myproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (myproc->name.jobid == proc->jobid &&
|
||||
myproc->name.vpid == proc->vpid) {
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &myproc->name, proc)) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
|
||||
"%s ess:hnp: proc %s is LOCAL",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -877,7 +884,7 @@ static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
|
||||
return ORTE_VPID_INVALID;
|
||||
}
|
||||
|
||||
if( NULL == pdata->node->daemon ) {
|
||||
if( NULL == pdata->node || NULL == pdata->node->daemon ) {
|
||||
return ORTE_VPID_INVALID;
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -67,6 +67,7 @@ orte_ess_base_module_t orte_ess_lsf_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -271,10 +272,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
|
||||
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
{
|
||||
orte_pmap_t *pmap;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
|
||||
/* is this me? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
/* yes it is - reply with my rank. This is necessary
|
||||
* because the pidmap will not have arrived when I
|
||||
* am starting up, and if we use static ports, then
|
||||
@ -354,6 +357,7 @@ static int lsf_set_name(void)
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
|
||||
|
||||
/* fix up the base name and make it the "real" name */
|
||||
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -56,6 +56,7 @@ orte_ess_base_module_t orte_ess_portals4_shmem_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
NULL, /* add_pidmap is only used in ORTE */
|
||||
NULL, /* update_nidmap is only used in ORTE */
|
||||
query_sys_info,
|
||||
@ -85,6 +86,10 @@ static int rte_init(void)
|
||||
|
||||
/* Get the number of procs in the job from portals4_shmem */
|
||||
orte_process_info.num_procs = (orte_std_cntr_t) runtime_get_size();
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* Get the nid map */
|
||||
nprocs = runtime_get_nidpid_map(&map);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -54,6 +54,7 @@ orte_ess_base_module_t orte_ess_portals_utcp_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
NULL, /* add_pidmap is only used in ORTE */
|
||||
NULL, /* update_nidmap is only used in ORTE */
|
||||
query_sys_info,
|
||||
@ -91,6 +92,7 @@ static int rte_init(void)
|
||||
return(rc);
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->epoch = ORTE_MIN_EPOCH;
|
||||
|
||||
/*
|
||||
* Get the number of procs in the job. We assume vpids start at 0. We
|
||||
@ -102,6 +104,10 @@ static int rte_init(void)
|
||||
nidmap = opal_argv_split(nidmap_string, ':');
|
||||
orte_process_info.num_procs = (orte_std_cntr_t) opal_argv_count(nidmap);
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* MPI_Init needs the grpcomm framework, so we have to init it */
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_open())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -88,6 +88,7 @@ orte_ess_base_module_t orte_ess_singleton_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -187,6 +188,7 @@ static int rte_init(void)
|
||||
/* set the name */
|
||||
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
|
||||
ORTE_PROC_MY_NAME->vpid = 0;
|
||||
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
} else {
|
||||
/*
|
||||
@ -231,6 +233,10 @@ static int rte_init(void)
|
||||
}
|
||||
|
||||
orte_process_info.num_procs = 1;
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* NOTE: do not wireup our io - let the fork'd orted serve
|
||||
* as our io handler. This prevents issues with the event
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -98,6 +98,7 @@ orte_ess_base_module_t orte_ess_slave_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -183,9 +184,12 @@ static uint8_t proc_get_locality(orte_process_name_t *proc)
|
||||
|
||||
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
|
||||
/* if it is me, the answer is my daemon's vpid */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
return ORTE_PROC_MY_DAEMON->vpid;
|
||||
}
|
||||
|
||||
@ -195,9 +199,11 @@ static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
|
||||
|
||||
static char* proc_get_hostname(orte_process_name_t *proc)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
/* if it is me, the answer is my nodename */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
return orte_process_info.nodename;
|
||||
}
|
||||
|
||||
@ -207,9 +213,11 @@ static char* proc_get_hostname(orte_process_name_t *proc)
|
||||
|
||||
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
/* if it is me, the local rank is zero */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -272,6 +280,7 @@ static int slave_set_name(void)
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -72,6 +72,7 @@ orte_ess_base_module_t orte_ess_slurm_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -275,10 +276,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
|
||||
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
{
|
||||
orte_pmap_t *pmap;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* is this me? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
/* yes it is - reply with my rank. This is necessary
|
||||
* because the pidmap will not have arrived when I
|
||||
* am starting up, and if we use static ports, then
|
||||
@ -367,8 +370,8 @@ static int slurm_set_name(void)
|
||||
/* fix up the vpid and make it the "real" vpid */
|
||||
slurm_nodeid = atoi(getenv("SLURM_NODEID"));
|
||||
ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid;
|
||||
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -81,6 +81,7 @@ orte_ess_base_module_t orte_ess_slurmd_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -189,7 +190,7 @@ static int rte_init(void)
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
|
||||
#endif
|
||||
|
||||
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
|
||||
/* get our local rank */
|
||||
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
|
||||
error = "could not get SLURM_LOCALID";
|
||||
@ -216,11 +217,15 @@ static int rte_init(void)
|
||||
orte_process_info.num_procs = strtol(envar, NULL, 10);
|
||||
#endif
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
#if WANT_SLURM_PMI_SUPPORT
|
||||
if (PMI_SUCCESS != PMI_Get_appnum(&i)) {
|
||||
error = "PMI_Get_appnum failed";
|
||||
goto error;
|
||||
}
|
||||
|
||||
orte_process_info.app_num = i;
|
||||
#else
|
||||
/* set the app_num so that MPI attributes get set correctly */
|
||||
@ -250,6 +255,7 @@ static int rte_init(void)
|
||||
nodeid = strtol(envar, NULL, 10);
|
||||
ORTE_PROC_MY_DAEMON->jobid = 0;
|
||||
ORTE_PROC_MY_DAEMON->vpid = nodeid;
|
||||
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
|
||||
/* get the number of ppn */
|
||||
if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {
|
||||
@ -338,6 +344,7 @@ static int rte_init(void)
|
||||
opal_pointer_array_add(&orte_jobmap, jmap);
|
||||
/* update the num procs */
|
||||
jmap->num_procs = orte_process_info.num_procs;
|
||||
|
||||
/* set the size of the pidmap storage so we minimize realloc's */
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -70,6 +70,7 @@ orte_ess_base_module_t orte_ess_tm_module = {
|
||||
proc_get_hostname,
|
||||
proc_get_local_rank,
|
||||
proc_get_node_rank,
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
update_pidmap,
|
||||
update_nidmap,
|
||||
orte_ess_base_query_sys_info,
|
||||
@ -273,10 +274,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
|
||||
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
|
||||
{
|
||||
orte_pmap_t *pmap;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* is this me? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
|
||||
/* yes it is - reply with my rank. This is necessary
|
||||
* because the pidmap will not have arrived when I
|
||||
* am starting up, and if we use static ports, then
|
||||
@ -361,6 +364,7 @@ static int tm_set_name(void)
|
||||
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
ORTE_PROC_MY_NAME->vpid = vpid;
|
||||
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"ess:tm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -57,6 +57,7 @@ orte_ess_base_module_t orte_ess_tool_module = {
|
||||
NULL, /* don't need a proc_get_hostname fn */
|
||||
NULL, /* don't need a proc_get_local_rank fn */
|
||||
NULL, /* don't need a proc_get_node_rank fn */
|
||||
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
|
||||
NULL, /* don't need to update_pidmap */
|
||||
NULL, /* don't need to update_nidmap */
|
||||
query_sys_info,
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
@ -1097,8 +1097,11 @@ static int orte_filem_rsh_start_command(orte_filem_base_process_set_t *proc_set
|
||||
if( NULL != proc_set ) {
|
||||
wp_item->proc_set.source.jobid = proc_set->source.jobid;
|
||||
wp_item->proc_set.source.vpid = proc_set->source.vpid;
|
||||
wp_item->proc_set.source.epoch = proc_set->source.epoch;
|
||||
|
||||
wp_item->proc_set.sink.jobid = proc_set->sink.jobid;
|
||||
wp_item->proc_set.sink.vpid = proc_set->sink.vpid;
|
||||
wp_item->proc_set.sink.epoch = proc_set->sink.epoch;
|
||||
}
|
||||
/* Copy the File Set */
|
||||
if( NULL != file_set ) {
|
||||
@ -1346,6 +1349,7 @@ static void orte_filem_rsh_permission_callback(int status,
|
||||
int num_req, num_allowed = 0;
|
||||
int perm_flag, i;
|
||||
int32_t peer_status = 0;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh: permission_callback(? ?): Peer %s ...",
|
||||
@ -1392,6 +1396,7 @@ static void orte_filem_rsh_permission_callback(int status,
|
||||
wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t);
|
||||
wp_item->proc_set.source.jobid = sender->jobid;
|
||||
wp_item->proc_set.source.vpid = sender->vpid;
|
||||
wp_item->proc_set.source.epoch = sender->epoch;
|
||||
|
||||
opal_list_append(&work_pool_waiting, &(wp_item->super));
|
||||
}
|
||||
@ -1443,8 +1448,10 @@ static void orte_filem_rsh_permission_callback(int status,
|
||||
item != opal_list_get_end( &work_pool_pending);
|
||||
item = opal_list_get_next( item) ) {
|
||||
wp_item = (orte_filem_rsh_work_pool_item_t *)item;
|
||||
if(sender->jobid == wp_item->proc_set.source.jobid &&
|
||||
sender->vpid == wp_item->proc_set.source.vpid ) {
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, sender, &wp_item->proc_set.source)) {
|
||||
opal_list_remove_item( &work_pool_pending, item);
|
||||
break;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -168,6 +168,9 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
|
||||
if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
|
||||
/* I send first */
|
||||
peer.vpid = vpids[1];
|
||||
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
|
||||
/* setup a temp buffer so I can inform the other side as to the
|
||||
* number of entries in my buffer
|
||||
*/
|
||||
@ -223,6 +226,9 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
|
||||
opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
|
||||
opal_dss.copy_payload(&buf, sendbuf);
|
||||
peer.vpid = vpids[0];
|
||||
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:coll:two-proc sending to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -314,6 +320,9 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
|
||||
/* first send my current contents */
|
||||
nv = (rank - distance + np) % np;
|
||||
peer.vpid = vpids[nv];
|
||||
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
|
||||
opal_dss.copy_payload(&buf, &collection);
|
||||
@ -331,6 +340,9 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
|
||||
num_recvd = 0;
|
||||
nv = (rank + distance) % np;
|
||||
peer.vpid = vpids[nv];
|
||||
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
|
||||
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
|
||||
ORTE_RML_TAG_DAEMON_COLLECTIVE,
|
||||
@ -427,6 +439,9 @@ static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int
|
||||
/* first send my current contents */
|
||||
nv = rank ^ distance;
|
||||
peer.vpid = vpids[nv];
|
||||
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
|
||||
opal_dss.copy_payload(&buf, &collection);
|
||||
@ -631,6 +646,8 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
|
||||
proc.jobid = jobid;
|
||||
proc.vpid = 0;
|
||||
while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
|
||||
proc.epoch = orte_ess.proc_get_epoch(&proc);
|
||||
|
||||
/* get the daemon that hosts this proc */
|
||||
daemonvpid = orte_ess.proc_get_daemon(&proc);
|
||||
/* is this daemon one of our children, or at least its contribution
|
||||
@ -695,6 +712,8 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
|
||||
/* send it */
|
||||
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
my_parent.vpid = orte_routed.get_routing_tree(NULL);
|
||||
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
|
||||
"%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -41,6 +41,8 @@
|
||||
#include "orte/orted/orted.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "opal/mca/paffinity/paffinity.h"
|
||||
|
||||
#include "orte/mca/grpcomm/base/base.h"
|
||||
#include "grpcomm_hier.h"
|
||||
|
||||
@ -93,6 +95,7 @@ static int init(void)
|
||||
|
||||
my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID;
|
||||
my_local_rank_zero_proc.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -267,6 +270,8 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
proc.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
for (v=0; v < orte_process_info.num_procs; v++) {
|
||||
proc.vpid = v;
|
||||
proc.epoch = orte_util_lookup_epoch(&proc);
|
||||
|
||||
/* is this proc local_rank=0 on its node? */
|
||||
if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
|
||||
my_coll_peers[cpeers++] = v;
|
||||
@ -280,12 +285,15 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
|
||||
nm = OBJ_NEW(orte_namelist_t);
|
||||
nm->name.jobid = proc.jobid;
|
||||
nm->name.vpid = proc.vpid;
|
||||
nm->name.epoch = proc.epoch;
|
||||
|
||||
opal_list_append(&my_local_peers, &nm->item);
|
||||
/* if I am not local_rank=0, is this one? */
|
||||
if (0 != my_local_rank &&
|
||||
0 == orte_ess.get_local_rank(&proc)) {
|
||||
my_local_rank_zero_proc.jobid = proc.jobid;
|
||||
my_local_rank_zero_proc.vpid = proc.vpid;
|
||||
my_local_rank_zero_proc.epoch = proc.epoch;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -135,6 +135,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
ep = OBJ_NEW(orte_iof_sink_t); \
|
||||
ep->name.jobid = (nm)->jobid; \
|
||||
ep->name.vpid = (nm)->vpid; \
|
||||
ep->name.epoch = (nm)->epoch; \
|
||||
ep->tag = (tg); \
|
||||
if (0 <= (fid)) { \
|
||||
ep->wev->fd = (fid); \
|
||||
@ -168,6 +169,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
rev = OBJ_NEW(orte_iof_read_event_t); \
|
||||
rev->name.jobid = (nm)->jobid; \
|
||||
rev->name.vpid = (nm)->vpid; \
|
||||
rev->name.epoch = (nm)->epoch; \
|
||||
rev->tag = (tg); \
|
||||
rev->fd = (fid); \
|
||||
*(rv) = rev; \
|
||||
@ -192,6 +194,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
ep = OBJ_NEW(orte_iof_sink_t); \
|
||||
ep->name.jobid = (nm)->jobid; \
|
||||
ep->name.vpid = (nm)->vpid; \
|
||||
ep->name.epoch = (nm)->epoch; \
|
||||
ep->tag = (tg); \
|
||||
if (0 <= (fid)) { \
|
||||
ep->wev->fd = (fid); \
|
||||
@ -212,6 +215,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
|
||||
rev = OBJ_NEW(orte_iof_read_event_t); \
|
||||
rev->name.jobid = (nm)->jobid; \
|
||||
rev->name.vpid = (nm)->vpid; \
|
||||
rev->name.epoch= (nm)->epoch; \
|
||||
rev->tag = (tg); \
|
||||
*(rv) = rev; \
|
||||
opal_event_set(opal_event_base, \
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -91,6 +91,7 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr)
|
||||
{
|
||||
ptr->daemon.jobid = ORTE_JOBID_INVALID;
|
||||
ptr->daemon.vpid = ORTE_VPID_INVALID;
|
||||
ptr->daemon.epoch = ORTE_EPOCH_INVALID;
|
||||
ptr->wev = OBJ_NEW(orte_iof_write_event_t);
|
||||
}
|
||||
static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr)
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -42,6 +42,7 @@
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
@ -147,6 +148,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
orte_odls_job_t *jobdat=NULL;
|
||||
int np, numdigs;
|
||||
int rc;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* don't do this if the dst vpid is invalid or the fd is negative! */
|
||||
if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
|
||||
@ -174,8 +176,8 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
item != opal_list_get_end(&mca_iof_hnp_component.procs);
|
||||
item = opal_list_get_next(item)) {
|
||||
proct = (orte_iof_proc_t*)item;
|
||||
if (proct->name.jobid == dst_name->jobid &&
|
||||
proct->name.vpid == dst_name->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
|
||||
/* found it */
|
||||
goto SETUP;
|
||||
}
|
||||
@ -184,6 +186,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
proct = OBJ_NEW(orte_iof_proc_t);
|
||||
proct->name.jobid = dst_name->jobid;
|
||||
proct->name.vpid = dst_name->vpid;
|
||||
proct->name.epoch = dst_name->epoch;
|
||||
opal_list_append(&mca_iof_hnp_component.procs, &proct->super);
|
||||
/* see if we are to output to a file */
|
||||
if (NULL != orte_output_filename) {
|
||||
@ -278,6 +281,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
|
||||
&mca_iof_hnp_component.sinks);
|
||||
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sink->daemon.vpid = proc->node->daemon->name.vpid;
|
||||
sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon);
|
||||
}
|
||||
}
|
||||
|
||||
@ -384,6 +388,7 @@ static int hnp_pull(const orte_process_name_t* dst_name,
|
||||
&mca_iof_hnp_component.sinks);
|
||||
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -397,15 +402,17 @@ static int hnp_close(const orte_process_name_t* peer,
|
||||
{
|
||||
opal_list_item_t *item, *next_item;
|
||||
orte_iof_sink_t* sink;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(&mca_iof_hnp_component.sinks);
|
||||
item != opal_list_get_end(&mca_iof_hnp_component.sinks);
|
||||
item = next_item ) {
|
||||
sink = (orte_iof_sink_t*)item;
|
||||
next_item = opal_list_get_next(item);
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if((sink->name.jobid == peer->jobid) &&
|
||||
(sink->name.vpid == peer->vpid) &&
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
|
||||
(source_tag & sink->tag)) {
|
||||
|
||||
/* No need to delete the event or close the file
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -89,6 +89,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
opal_list_item_t *item, *prev_item;
|
||||
orte_iof_proc_t *proct;
|
||||
int rc;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);
|
||||
|
||||
@ -146,9 +147,10 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
continue;
|
||||
}
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* if the daemon is me, then this is a local sink */
|
||||
if (ORTE_PROC_MY_NAME->jobid == sink->daemon.jobid &&
|
||||
ORTE_PROC_MY_NAME->vpid == sink->daemon.vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &sink->daemon)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
|
||||
"%s read %d bytes from stdin - writing to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
|
||||
@ -258,8 +260,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
item != opal_list_get_end(&mca_iof_hnp_component.procs);
|
||||
item = opal_list_get_next(item)) {
|
||||
proct = (orte_iof_proc_t*)item;
|
||||
if (proct->name.jobid == rev->name.jobid &&
|
||||
proct->name.vpid == rev->name.vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
|
||||
/* found it - release corresponding event. This deletes
|
||||
* the read event and closes the file descriptor
|
||||
*/
|
||||
@ -317,8 +319,9 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
|
||||
continue;
|
||||
}
|
||||
/* is this the desired proc? */
|
||||
if (sink->name.jobid == rev->name.jobid &&
|
||||
sink->name.vpid == rev->name.vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
|
||||
/* output to the corresponding file */
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
|
||||
/* done */
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -67,7 +67,7 @@ static void process_msg(int fd, short event, void *cbdata)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
|
||||
|
||||
if (ORTE_IOF_XON & stream) {
|
||||
/* re-start the stdin read event */
|
||||
if (NULL != mca_iof_hnp_component.stdinev &&
|
||||
@ -109,18 +109,21 @@ static void process_msg(int fd, short event, void *cbdata)
|
||||
NULL, &mca_iof_hnp_component.sinks);
|
||||
sink->daemon.jobid = mev->sender.jobid;
|
||||
sink->daemon.vpid = mev->sender.vpid;
|
||||
sink->daemon.epoch = mev->sender.epoch;
|
||||
}
|
||||
if (ORTE_IOF_STDERR & stream) {
|
||||
ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR,
|
||||
NULL, &mca_iof_hnp_component.sinks);
|
||||
sink->daemon.jobid = mev->sender.jobid;
|
||||
sink->daemon.vpid = mev->sender.vpid;
|
||||
sink->daemon.epoch = mev->sender.epoch;
|
||||
}
|
||||
if (ORTE_IOF_STDDIAG & stream) {
|
||||
ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG,
|
||||
NULL, &mca_iof_hnp_component.sinks);
|
||||
sink->daemon.jobid = mev->sender.jobid;
|
||||
sink->daemon.vpid = mev->sender.vpid;
|
||||
sink->daemon.epoch = mev->sender.epoch;
|
||||
}
|
||||
goto CLEAN_RETURN;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -128,6 +128,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
|
||||
int fdout;
|
||||
orte_odls_job_t *jobdat=NULL;
|
||||
int np, numdigs;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
|
||||
"%s iof:orted pushing fd %d for process %s",
|
||||
@ -150,8 +151,10 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
|
||||
item != opal_list_get_end(&mca_iof_orted_component.procs);
|
||||
item = opal_list_get_next(item)) {
|
||||
proct = (orte_iof_proc_t*)item;
|
||||
if (proct->name.jobid == dst_name->jobid &&
|
||||
proct->name.vpid == dst_name->vpid) {
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
|
||||
/* found it */
|
||||
goto SETUP;
|
||||
}
|
||||
@ -160,6 +163,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
|
||||
proct = OBJ_NEW(orte_iof_proc_t);
|
||||
proct->name.jobid = dst_name->jobid;
|
||||
proct->name.vpid = dst_name->vpid;
|
||||
proct->name.epoch = dst_name->epoch;
|
||||
opal_list_append(&mca_iof_orted_component.procs, &proct->super);
|
||||
/* see if we are to output to a file */
|
||||
if (NULL != orte_output_filename) {
|
||||
@ -285,6 +289,7 @@ static int orted_close(const orte_process_name_t* peer,
|
||||
{
|
||||
opal_list_item_t *item, *next_item;
|
||||
orte_iof_sink_t* sink;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
|
||||
|
||||
@ -294,8 +299,9 @@ static int orted_close(const orte_process_name_t* peer,
|
||||
sink = (orte_iof_sink_t*)item;
|
||||
next_item = opal_list_get_next(item);
|
||||
|
||||
if((sink->name.jobid == peer->jobid) &&
|
||||
(sink->name.vpid == peer->vpid) &&
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
|
||||
(source_tag & sink->tag)) {
|
||||
|
||||
/* No need to delete the event or close the file
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -65,6 +65,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
int32_t numbytes;
|
||||
opal_list_item_t *item;
|
||||
orte_iof_proc_t *proct;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
|
||||
|
||||
@ -119,9 +120,11 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
|
||||
if (ORTE_IOF_STDIN & sink->tag) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* is this the desired proc? */
|
||||
if (sink->name.jobid == rev->name.jobid &&
|
||||
sink->name.vpid == rev->name.vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
|
||||
/* output to the corresponding file */
|
||||
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
|
||||
/* done */
|
||||
@ -178,8 +181,8 @@ CLEAN_RETURN:
|
||||
item != opal_list_get_end(&mca_iof_orted_component.procs);
|
||||
item = opal_list_get_next(item)) {
|
||||
proct = (orte_iof_proc_t*)item;
|
||||
if (proct->name.jobid == rev->name.jobid &&
|
||||
proct->name.vpid == rev->name.vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
|
||||
/* found it - release corresponding event. This deletes
|
||||
* the read event and closes the file descriptor
|
||||
*/
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -84,6 +84,9 @@ ORTE_DECLSPEC void orte_base_default_waitpid_fired(orte_process_name_t *proc, in
|
||||
/* setup singleton job data */
|
||||
ORTE_DECLSPEC void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid);
|
||||
|
||||
/* Lookup function to see if the child process has already finished. */
|
||||
ORTE_DECLSPEC bool orte_odls_base_default_check_finished(orte_process_name_t *proc);
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -513,7 +513,7 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* update the routing tree */
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -556,7 +556,7 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
return rc;
|
||||
}
|
||||
/* update the routing tree */
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -620,7 +620,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
char **slot_str=NULL;
|
||||
orte_jobid_t debugger;
|
||||
bool add_child;
|
||||
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:constructing child list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -945,6 +946,8 @@ find_my_procs:
|
||||
proc.jobid = jobdat->jobid;
|
||||
for (j=0; j < jobdat->num_procs; j++) {
|
||||
proc.vpid = j;
|
||||
proc.epoch = ORTE_EPOCH_INVALID;
|
||||
proc.epoch = orte_ess.proc_get_epoch(&proc);
|
||||
/* get the vpid of the daemon that is to host this proc */
|
||||
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
@ -976,8 +979,11 @@ find_my_procs:
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc.jobid &&
|
||||
child->name->vpid == proc.vpid) {
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, child->name, &proc)) {
|
||||
/* do not duplicate this child on the list! */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"proc %s is on list and is %s",
|
||||
@ -1243,6 +1249,20 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* setup the epoch */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, child->name->epoch))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","epoch"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
return rc;
|
||||
}
|
||||
opal_setenv(param, value, true, env);
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* setup the vpid */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -2419,6 +2439,7 @@ void orte_odls_base_notify_iof_complete(orte_process_name_t *proc)
|
||||
orte_odls_child_t *child;
|
||||
opal_list_item_t *item;
|
||||
int rc;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:notify_iof_complete for child %s",
|
||||
@ -2437,9 +2458,10 @@ void orte_odls_base_notify_iof_complete(orte_process_name_t *proc)
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) { /* found it */
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { /* found it */
|
||||
goto GOTCHILD;
|
||||
}
|
||||
}
|
||||
@ -2497,6 +2519,7 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
|
||||
opal_list_item_t *item;
|
||||
opal_buffer_t buffer;
|
||||
int rc;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* since we are going to be working with the global list of
|
||||
* children, we need to protect that list from modification
|
||||
@ -2510,9 +2533,11 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (proc->jobid == child->name->jobid &&
|
||||
proc->vpid == child->name->vpid) { /* found it */
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */
|
||||
child->state = ORTE_PROC_STATE_CALLED_ABORT;
|
||||
/* send ack */
|
||||
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
|
||||
@ -2533,6 +2558,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
orte_odls_job_t *jobdat, *jdat;
|
||||
opal_list_item_t *item;
|
||||
int rc;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired on child %s with status %d",
|
||||
@ -2552,8 +2578,10 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
if (proc->jobid == child->name->jobid &&
|
||||
proc->vpid == child->name->vpid) { /* found it */
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */
|
||||
goto GOTCHILD;
|
||||
}
|
||||
}
|
||||
@ -2893,6 +2921,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
|
||||
OBJ_CONSTRUCT(&proctmp, orte_proc_t);
|
||||
proctmp.name.jobid = ORTE_JOBID_WILDCARD;
|
||||
proctmp.name.vpid = ORTE_VPID_WILDCARD;
|
||||
proctmp.name.epoch = ORTE_EPOCH_WILDCARD;
|
||||
opal_pointer_array_add(&procarray, &proctmp);
|
||||
procptr = &procarray;
|
||||
do_cleanup = true;
|
||||
@ -3258,3 +3287,26 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
 * Lookup function to see if the child process has already finished.
 *
 * Walks orte_local_children while holding orte_odls_globals.mutex and
 * looks for the child whose name matches proc when compared with
 * ORTE_NS_CMP_ALL.
 *
 * @param proc  name of the process to look up
 *
 * @return the matching child's fini_recvd flag, or false when no entry
 *         on orte_local_children matches proc
 */
bool orte_odls_base_default_check_finished(orte_process_name_t *proc) {
    orte_odls_child_t *child;
    opal_list_item_t *item;
    /* the comparison mask never changes, so set it once outside the loop */
    orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL;
    bool finished = false;

    OPAL_THREAD_LOCK(&orte_odls_globals.mutex);

    /* find this child */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */
            finished = child->fini_recvd;
            break;
        }
    }

    /* single exit point: the mutex taken above must be released whether or
     * not a matching child was found, otherwise the lock is leaked
     */
    OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);

    return finished;
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -41,6 +41,7 @@
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/parse_options.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
|
||||
@ -185,6 +186,7 @@ int orte_odls_base_open(void)
|
||||
if (-1 == rank) {
|
||||
/* wildcard */
|
||||
nm->name.vpid = ORTE_VPID_WILDCARD;
|
||||
nm->name.epoch = ORTE_EPOCH_WILDCARD;
|
||||
} else if (rank < 0) {
|
||||
/* error out on bozo case */
|
||||
orte_show_help("help-odls-base.txt",
|
||||
@ -197,6 +199,7 @@ int orte_odls_base_open(void)
|
||||
* will be in the job - we'll check later
|
||||
*/
|
||||
nm->name.vpid = rank;
|
||||
nm->name.epoch = orte_ess.proc_get_epoch(&nm->name);
|
||||
}
|
||||
opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item);
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -77,14 +77,17 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
|
||||
/* if I am the HNP, then use me as the source */
|
||||
p_set->source.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
p_set->source.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
p_set->source.epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
}
|
||||
else {
|
||||
/* otherwise, set the HNP as the source */
|
||||
p_set->source.jobid = ORTE_PROC_MY_HNP->jobid;
|
||||
p_set->source.vpid = ORTE_PROC_MY_HNP->vpid;
|
||||
p_set->source.epoch = ORTE_PROC_MY_HNP->epoch;
|
||||
}
|
||||
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
|
||||
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -88,6 +88,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
/* process called "errmgr.abort_procs" */
|
||||
#define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28
|
||||
|
||||
/* commands used for fault recovery */
|
||||
#define ORTE_PROCESS_FAILED_NOTIFICATION (orte_daemon_cmd_flag_t) 30
|
||||
|
||||
/*
|
||||
* List object to locally store the process names and pids of
|
||||
* our children. This can subsequently be used to order termination
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -164,8 +164,7 @@ int mca_oob_tcp_component_open(void)
|
||||
#ifdef __WINDOWS__
|
||||
WSADATA win_sock_data;
|
||||
if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) {
|
||||
opal_output (0, "mca_oob_tcp_component_open: failed to initialise windows sockets: error %d\n",
|
||||
WSAGetLastError());
|
||||
opal_output (0, "mca_oob_tcp_component_init: failed to initialise windows sockets: error %d\n", WSAGetLastError());
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
#endif
|
||||
@ -432,7 +431,7 @@ int mca_oob_tcp_component_close(void)
|
||||
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.tcp_available_devices))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
||||
#if 0
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_connections_lock);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_connections_return);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_pending_connections);
|
||||
@ -452,6 +451,7 @@ int mca_oob_tcp_component_close(void)
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list);
|
||||
|
||||
opal_output_close(mca_oob_tcp_output_handle);
|
||||
#endif
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -1975,7 +1975,7 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri)
|
||||
peer->peer_state = MCA_OOB_TCP_CLOSED;
|
||||
/* clear any pending sends */
|
||||
while (NULL != (item = opal_list_remove_first(&peer->peer_send_queue))) {
|
||||
MCA_OOB_TCP_MSG_RETURN( ((mca_oob_tcp_msg_t *)item) );
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
peer->peer_send_msg = NULL;
|
||||
/* clear any pending recvs */
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -605,14 +605,26 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
peer->peer_state);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, mca_oob_tcp_output_handle,
|
||||
"%s-%s mca_oob_tcp_peer_close(%p) sd %d state %d\n",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
(void *) peer,
|
||||
peer->peer_sd,
|
||||
peer->peer_state));
|
||||
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
|
||||
/* inform the ERRMGR framework that we have lost a connection so
|
||||
* it can decide if this is important, what to do about it, etc.
|
||||
*/
|
||||
if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(peer->peer_name.jobid, ORTE_JOB_STATE_COMM_FAILED,
|
||||
&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED,
|
||||
0, ORTE_ERROR_DEFAULT_EXIT_CODE)) {
|
||||
if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(
|
||||
peer->peer_name.jobid,
|
||||
ORTE_JOB_STATE_COMM_FAILED,
|
||||
&peer->peer_name,
|
||||
ORTE_PROC_STATE_COMM_FAILED,
|
||||
0,
|
||||
ORTE_ERROR_DEFAULT_EXIT_CODE)) {
|
||||
/* Should free the peer lock before we abort so we don't
|
||||
* get stuck in the orte_wait_kill when receiving messages in the
|
||||
* tcp OOB
|
||||
@ -891,11 +903,6 @@ int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer)
|
||||
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
|
||||
{
|
||||
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user;
|
||||
/* if we are abnormally terminating, ignore this */
|
||||
if (orte_abnormal_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&peer->peer_lock);
|
||||
switch(peer->peer_state) {
|
||||
case MCA_OOB_TCP_CONNECT_ACK:
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -62,10 +62,12 @@ int orte_plm_base_set_hnp_name(void)
|
||||
/* set the name */
|
||||
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
|
||||
ORTE_PROC_MY_NAME->vpid = 0;
|
||||
ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN;
|
||||
|
||||
/* copy it to the HNP field */
|
||||
ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
|
||||
/* done */
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -40,6 +40,7 @@
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/iof/iof.h"
|
||||
#include "orte/mca/ras/ras.h"
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
@ -219,7 +220,12 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
* asked to communicate.
|
||||
*/
|
||||
orte_process_info.num_procs = jdatorted->num_procs;
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -371,6 +377,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
|
||||
/* push stdin - the IOF will know what to do with the specified target */
|
||||
name.jobid = job;
|
||||
name.vpid = jdata->stdin_target;
|
||||
name.epoch = orte_ess.proc_get_epoch(&name);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -606,7 +613,6 @@ CLEANUP:
|
||||
} else {
|
||||
orted_num_callback++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void orted_report_launch(int status, orte_process_name_t* sender,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -34,6 +34,7 @@
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -162,6 +163,8 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
|
||||
continue;
|
||||
}
|
||||
peer.vpid = v;
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
|
||||
/* don't worry about errors on the send here - just
|
||||
* issue it and keep going
|
||||
*/
|
||||
@ -238,6 +241,7 @@ int orte_plm_base_orted_terminate_job(orte_jobid_t jobid)
|
||||
OBJ_CONSTRUCT(&proc, orte_proc_t);
|
||||
proc.name.jobid = jobid;
|
||||
proc.name.vpid = ORTE_VPID_WILDCARD;
|
||||
proc.name.epoch = ORTE_EPOCH_WILDCARD;
|
||||
opal_pointer_array_add(&procs, &proc);
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -335,6 +339,7 @@ int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs)
|
||||
continue;
|
||||
}
|
||||
peer.vpid = v;
|
||||
peer.epoch = orte_ess.proc_get_epoch(&peer);
|
||||
/* check to see if this daemon is known to be "dead" */
|
||||
if (proc->state > ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* don't try to send this */
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -44,6 +44,7 @@
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
@ -145,6 +146,7 @@ static void process_msg(int fd, short event, void *data)
|
||||
orte_job_t *jdata, *parent;
|
||||
opal_buffer_t answer;
|
||||
orte_vpid_t vpid;
|
||||
orte_epoch_t epoch;
|
||||
orte_proc_t *proc;
|
||||
orte_proc_state_t state;
|
||||
orte_exit_code_t exit_code;
|
||||
@ -392,6 +394,9 @@ static void process_msg(int fd, short event, void *data)
|
||||
break;
|
||||
}
|
||||
name.vpid = vpid;
|
||||
name.epoch = ORTE_EPOCH_INVALID;
|
||||
name.epoch = orte_ess.proc_get_epoch(&name);
|
||||
|
||||
/* unpack the pid */
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) {
|
||||
@ -467,6 +472,9 @@ static void process_msg(int fd, short event, void *data)
|
||||
break;
|
||||
|
||||
case ORTE_PLM_INIT_ROUTES_CMD:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive init routes command",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
count=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -479,6 +487,15 @@ static void process_msg(int fd, short event, void *data)
|
||||
break;
|
||||
}
|
||||
name.vpid = vpid;
|
||||
|
||||
count=1;
|
||||
opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH);
|
||||
name.epoch = epoch;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive Described rank %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&name)));
|
||||
/* update the errmgr state */
|
||||
orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED,
|
||||
&name, ORTE_PROC_STATE_REGISTERED,
|
||||
@ -491,9 +508,17 @@ static void process_msg(int fd, short event, void *data)
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive done with init routes command",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive unknown command",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
break;
|
||||
@ -516,7 +541,10 @@ static void process_msg(int fd, short event, void *data)
|
||||
if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
|
||||
orte_jobs_complete();
|
||||
}
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:receive done processing commands",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -54,6 +54,7 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -1525,6 +1526,8 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
|
||||
{
|
||||
char *param, *path, *tmp, *cmd, *basename, *dest_dir;
|
||||
int i;
|
||||
orte_epoch_t epoch;
|
||||
orte_process_name_t proc;
|
||||
|
||||
/* if a prefix is set, pass it to the bootproxy in a special way */
|
||||
if (NULL != app->prefix_dir) {
|
||||
@ -1633,6 +1636,17 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
|
||||
free(param);
|
||||
opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv);
|
||||
free(cmd);
|
||||
|
||||
/* set the epoch */
|
||||
proc.jobid = jobid;
|
||||
proc.vpid = vpid;
|
||||
proc.epoch = ORTE_EPOCH_MIN;
|
||||
epoch = orte_ess.proc_get_epoch(&proc);
|
||||
orte_util_convert_epoch_to_string(&cmd, epoch);
|
||||
param = mca_base_param_environ_variable("orte","ess","epoch");
|
||||
opal_setenv(param, cmd, true, argv);
|
||||
free(param);
|
||||
free(cmd);
|
||||
|
||||
/* set the number of procs */
|
||||
asprintf(&cmd, "%d", (int)num_procs);
|
||||
@ -1727,6 +1741,7 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
orte_node_t *node_from_map, *node;
|
||||
orte_odls_job_t *jobdat = NULL;
|
||||
opal_list_item_t *item = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* set the state to restart */
|
||||
jdata->state = ORTE_JOB_STATE_RESTART;
|
||||
@ -1751,8 +1766,10 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
|
||||
if (NULL == (proc_from_node = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (proc_from_node->name.jobid == proc->name.jobid &&
|
||||
proc_from_node->name.vpid == proc->name.vpid) {
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proc_from_node->name, &proc->name)) {
|
||||
/* got it! */
|
||||
OBJ_RELEASE(proc); /* keep accounting straight */
|
||||
opal_pointer_array_set_item(node->procs, i, NULL);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,6 +30,7 @@
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "opal/mca/sysinfo/sysinfo_types.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
@ -451,6 +452,10 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
|
||||
/* we do not set the vpid here - this will be done
|
||||
* during a second phase
|
||||
*/
|
||||
|
||||
/* We do set the epoch here since they all start with the same value. */
|
||||
proc->name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
proc->app_idx = app_idx;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:base:claim_slot: created new proc %s",
|
||||
@ -554,6 +559,11 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
proc->name.vpid = vpid;
|
||||
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
|
||||
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
|
||||
if (ORTE_NODE_RANK_INVALID == proc->name.epoch) {
|
||||
proc->name.epoch = ORTE_EPOCH_MIN;
|
||||
}
|
||||
}
|
||||
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
|
||||
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
|
||||
@ -590,6 +600,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
proc->name.vpid = vpid;
|
||||
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
|
||||
}
|
||||
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
|
||||
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
|
||||
@ -822,6 +833,7 @@ int orte_rmaps_base_define_daemons(orte_job_t *jdata)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
proc->name.vpid = daemons->num_procs; /* take the next available vpid */
|
||||
proc->name.epoch = ORTE_EPOCH_MIN;
|
||||
proc->node = node;
|
||||
proc->nodename = node->name;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
@ -1000,6 +1012,7 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
proc->name.vpid = jdata->num_procs; /* take the next available vpid */
|
||||
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
|
||||
proc->node = node;
|
||||
proc->nodename = node->name;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -40,6 +40,7 @@
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
@ -500,6 +501,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
proc->name.vpid = rank;
|
||||
/* Either init or update the epoch. */
|
||||
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
|
||||
|
||||
proc->slot_list = strdup(rfmap->slot_list);
|
||||
/* insert the proc into the proper place */
|
||||
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -35,6 +35,7 @@
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/util/hostfile/hostfile.h"
|
||||
#include "orte/util/dash_host/dash_host.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -234,6 +235,8 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
|
||||
}
|
||||
/* assign the vpid */
|
||||
proc->name.vpid = vpid++;
|
||||
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
|
||||
|
||||
/* add to the jdata proc array */
|
||||
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -341,6 +341,7 @@ static void recv_construct(rmcast_base_recv_t *ptr)
|
||||
{
|
||||
ptr->name.jobid = ORTE_JOBID_INVALID;
|
||||
ptr->name.vpid = ORTE_VPID_INVALID;
|
||||
ptr->name.epoch= ORTE_EPOCH_INVALID;
|
||||
ptr->channel = ORTE_RMCAST_INVALID_CHANNEL;
|
||||
OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t);
|
||||
ptr->seq_num = ORTE_RMCAST_SEQ_INVALID;
|
||||
@ -428,6 +429,7 @@ static void recvlog_construct(rmcast_recv_log_t *ptr)
|
||||
{
|
||||
ptr->name.jobid = ORTE_JOBID_INVALID;
|
||||
ptr->name.vpid = ORTE_VPID_INVALID;
|
||||
ptr->name.epoch = ORTE_EPOCH_INVALID;
|
||||
OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t);
|
||||
}
|
||||
static void recvlog_destruct(rmcast_recv_log_t *ptr)
|
||||
@ -436,6 +438,7 @@ static void recvlog_destruct(rmcast_recv_log_t *ptr)
|
||||
|
||||
ptr->name.jobid = ORTE_JOBID_INVALID;
|
||||
ptr->name.vpid = ORTE_VPID_INVALID;
|
||||
ptr->name.epoch = ORTE_EPOCH_INVALID;
|
||||
while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -678,6 +681,7 @@ static int tcp_recv(orte_process_name_t *name,
|
||||
/* caller requested id of sender */
|
||||
name->jobid = recvptr->name.jobid;
|
||||
name->vpid = recvptr->name.vpid;
|
||||
name->epoch= recvptr->name.epoch;
|
||||
}
|
||||
*seq_num = recvptr->seq_num;
|
||||
*msg = recvptr->iovec_array;
|
||||
@ -772,6 +776,7 @@ static int tcp_recv_buffer(orte_process_name_t *name,
|
||||
/* caller requested id of sender */
|
||||
name->jobid = recvptr->name.jobid;
|
||||
name->vpid = recvptr->name.vpid;
|
||||
name->epoch= recvptr->name.epoch;
|
||||
}
|
||||
*seq_num = recvptr->seq_num;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -457,6 +460,7 @@ static int udp_recv(orte_process_name_t *name,
|
||||
/* caller requested id of sender */
|
||||
name->jobid = recvptr->name.jobid;
|
||||
name->vpid = recvptr->name.vpid;
|
||||
name->epoch= recvptr->name.epoch;
|
||||
}
|
||||
*seq_num = recvptr->seq_num;
|
||||
*msg = recvptr->iovec_array;
|
||||
@ -549,6 +553,7 @@ static int udp_recv_buffer(orte_process_name_t *name,
|
||||
/* caller requested id of sender */
|
||||
name->jobid = recvptr->name.jobid;
|
||||
name->vpid = recvptr->name.vpid;
|
||||
name->epoch= recvptr->name.epoch;
|
||||
}
|
||||
*seq_num = recvptr->seq_num;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {
|
||||
|
@ -1,4 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -64,12 +67,14 @@ static void msg_pkt_constructor(orte_msg_packet_t *pkt)
|
||||
{
|
||||
pkt->sender.jobid = ORTE_JOBID_INVALID;
|
||||
pkt->sender.vpid = ORTE_VPID_INVALID;
|
||||
pkt->sender.epoch = ORTE_EPOCH_INVALID;
|
||||
pkt->buffer = NULL;
|
||||
}
|
||||
static void msg_pkt_destructor(orte_msg_packet_t *pkt)
|
||||
{
|
||||
pkt->sender.jobid = ORTE_JOBID_INVALID;
|
||||
pkt->sender.vpid = ORTE_VPID_INVALID;
|
||||
pkt->sender.epoch = ORTE_EPOCH_INVALID;
|
||||
if (NULL != pkt->buffer) {
|
||||
OBJ_RELEASE(pkt->buffer);
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -138,10 +138,15 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
|
||||
ORTE_PROC_IS_DAEMON &&
|
||||
orte_process_info.num_procs < num_procs) {
|
||||
orte_process_info.num_procs = num_procs;
|
||||
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* if we changed it, then we better update the routed
|
||||
* tree so daemon collectives work correctly
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -360,6 +360,7 @@ rml_oob_queued_progress(int fd, short event, void *arg)
|
||||
origin = hdr->origin;
|
||||
|
||||
next = orte_routed.get_route(&hdr->destination);
|
||||
#if 0
|
||||
if (next.vpid == ORTE_VPID_INVALID) {
|
||||
opal_output(0,
|
||||
"%s:queued progress tried routing message from %s to %s:%d, can't find route",
|
||||
@ -370,6 +371,7 @@ rml_oob_queued_progress(int fd, short event, void *arg)
|
||||
opal_backtrace_print(stderr);
|
||||
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
|
||||
opal_output(0, "%s:queued progress trying to get message from %s to %s:%d, routing loop",
|
||||
@ -467,6 +469,7 @@ rml_oob_recv_route_callback(int status,
|
||||
|
||||
next = orte_routed.get_route(&hdr->destination);
|
||||
if (next.vpid == ORTE_VPID_INVALID) {
|
||||
#if 0
|
||||
opal_output(0, "%s:route_callback tried routing message from %s to %s:%d, can't find route",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&origin),
|
||||
@ -474,6 +477,8 @@ rml_oob_recv_route_callback(int status,
|
||||
hdr->tag);
|
||||
opal_backtrace_print(stderr);
|
||||
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
|
||||
|
@ -2,6 +2,9 @@
|
||||
* Copyright (c) 2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -77,7 +80,8 @@ orte_rml_oob_purge(orte_process_name_t *peer)
|
||||
orte_rml_oob_queued_msg_t *qmsg;
|
||||
orte_rml_oob_msg_header_t *hdr;
|
||||
orte_process_name_t step;
|
||||
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* clear the oob contact info and pending messages */
|
||||
orte_rml_oob_module.active_oob->oob_set_addr(peer, NULL);
|
||||
|
||||
@ -89,12 +93,14 @@ orte_rml_oob_purge(orte_process_name_t *peer)
|
||||
qmsg = (orte_rml_oob_queued_msg_t*)item;
|
||||
hdr = (orte_rml_oob_msg_header_t*) qmsg->payload[0].iov_base;
|
||||
step = orte_routed.get_route(&hdr->destination);
|
||||
if (peer->jobid == hdr->destination.jobid &&
|
||||
peer->vpid == hdr->destination.vpid) {
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, peer, &hdr->destination)) {
|
||||
opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item);
|
||||
OBJ_RELEASE(item);
|
||||
} else if (step.jobid == hdr->destination.jobid &&
|
||||
step.vpid == hdr->destination.vpid) {
|
||||
} else if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &step, &hdr->destination)) {
|
||||
opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item);
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
|
@ -1,4 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -17,7 +20,6 @@
|
||||
|
||||
#include "rml_oob.h"
|
||||
|
||||
|
||||
static void
|
||||
orte_rml_recv_msg_callback(int status,
|
||||
struct orte_process_name_t* peer,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -62,6 +62,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
|
||||
pkt = OBJ_NEW(orte_msg_packet_t); \
|
||||
pkt->sender.jobid = (sndr)->jobid; \
|
||||
pkt->sender.vpid = (sndr)->vpid; \
|
||||
pkt->sender.epoch = (sndr)->epoch; \
|
||||
if ((crt)) { \
|
||||
pkt->buffer = OBJ_NEW(opal_buffer_t); \
|
||||
opal_dss.copy_payload(pkt->buffer, *(buf)); \
|
||||
@ -84,6 +85,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
|
||||
pkt = OBJ_NEW(orte_msg_packet_t); \
|
||||
pkt->sender.jobid = (sndr)->jobid; \
|
||||
pkt->sender.vpid = (sndr)->vpid; \
|
||||
pkt->sender.epoch = (sndr)->epoch; \
|
||||
if ((crt)) { \
|
||||
pkt->buffer = OBJ_NEW(opal_buffer_t); \
|
||||
opal_dss.copy_payload(pkt->buffer, *(buf)); \
|
||||
@ -189,6 +191,9 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
|
||||
|
||||
#define ORTE_RML_TAG_SUBSCRIBE 46
|
||||
|
||||
/* For Epoch Updates */
|
||||
#define ORTE_RML_TAG_EPOCH_CHANGE 47
|
||||
|
||||
#define ORTE_RML_TAG_MAX 100
|
||||
|
||||
|
||||
|
@ -4,6 +4,9 @@
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -62,6 +65,7 @@ static void jfamconst(orte_routed_jobfam_t *ptr)
|
||||
{
|
||||
ptr->route.jobid = ORTE_JOBID_INVALID;
|
||||
ptr->route.vpid = ORTE_VPID_INVALID;
|
||||
ptr->route.epoch = ORTE_EPOCH_INVALID;
|
||||
ptr->hnp_uri = NULL;
|
||||
}
|
||||
static void jfamdest(orte_routed_jobfam_t *ptr)
|
||||
@ -113,6 +117,7 @@ orte_routed_base_open(void)
|
||||
jfam = OBJ_NEW(orte_routed_jobfam_t);
|
||||
jfam->route.jobid = ORTE_PROC_MY_HNP->jobid;
|
||||
jfam->route.vpid = ORTE_PROC_MY_HNP->vpid;
|
||||
jfam->route.epoch = ORTE_PROC_MY_HNP->epoch;
|
||||
jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
|
||||
if (NULL != orte_process_info.my_hnp_uri) {
|
||||
jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri);
|
||||
@ -247,6 +252,7 @@ void orte_routed_base_update_hnps(opal_buffer_t *buf)
|
||||
jfam->job_family = jobfamily;
|
||||
jfam->route.jobid = name.jobid;
|
||||
jfam->route.vpid = name.vpid;
|
||||
jfam->route.epoch = name.epoch;
|
||||
jfam->hnp_uri = strdup(uri);
|
||||
done:
|
||||
free(uri);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -127,6 +127,7 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
||||
orte_std_cntr_t cnt;
|
||||
char *rml_uri;
|
||||
orte_vpid_t vpid;
|
||||
orte_epoch_t epoch;
|
||||
int rc;
|
||||
|
||||
if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
|
||||
@ -144,13 +145,18 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
|
||||
/* unpack the data for each entry */
|
||||
cnt = 1;
|
||||
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) {
|
||||
|
||||
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, ORTE_EPOCH))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routed_binomial:callback got uri %s for job %s rank %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,6 +29,7 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
@ -44,7 +48,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
|
||||
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static bool route_is_defined(const orte_process_name_t *target);
|
||||
static int update_routing_tree(void);
|
||||
static int update_routing_tree(orte_jobid_t jobid);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
@ -143,7 +147,8 @@ static int delete_route(orte_process_name_t *proc)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (proc->jobid == ORTE_JOBID_INVALID ||
|
||||
proc->vpid == ORTE_VPID_INVALID) {
|
||||
proc->vpid == ORTE_VPID_INVALID ||
|
||||
proc->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -211,7 +216,8 @@ static int update_route(orte_process_name_t *target,
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -269,6 +275,8 @@ static int update_route(orte_process_name_t *target,
|
||||
ORTE_NAME_PRINT(route)));
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -282,6 +290,8 @@ static int update_route(orte_process_name_t *target,
|
||||
jfam->job_family = jfamily;
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
|
||||
|
||||
opal_pointer_array_add(&orte_routed_jobfams, jfam);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -304,11 +314,12 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
ret = ORTE_NAME_INVALID;
|
||||
goto found;
|
||||
}
|
||||
|
||||
|
||||
/* if it is me, then the route is just direct */
|
||||
if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
|
||||
ret = target;
|
||||
@ -376,48 +387,55 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
}
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
|
||||
/* if we are not using static ports and this is going to the HNP, send it through my parent */
|
||||
if (!orte_static_ports &&
|
||||
ORTE_PROC_MY_HNP->jobid == target->jobid &&
|
||||
ORTE_PROC_MY_HNP->vpid == target->vpid) {
|
||||
if( !orte_static_ports &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing not enabled - going direct",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ret = target;
|
||||
"%s routing to the HNP through my PLM parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
|
||||
ret = ORTE_PROC_MY_PARENT;
|
||||
goto found;
|
||||
}
|
||||
|
||||
|
||||
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
/* find out what daemon hosts this proc */
|
||||
if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
/*ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);*/
|
||||
ret = ORTE_NAME_INVALID;
|
||||
goto found;
|
||||
}
|
||||
|
||||
|
||||
/* if the daemon is me, then send direct to the target! */
|
||||
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
|
||||
ret = target;
|
||||
goto found;
|
||||
} else {
|
||||
/* search routing tree for next step to that daemon */
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
if (child->vpid == daemon.vpid) {
|
||||
/* the child is hosting the proc - just send it there */
|
||||
ret = &daemon;
|
||||
goto found;
|
||||
}
|
||||
/* otherwise, see if the daemon we need is below the child */
|
||||
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
|
||||
/* yep - we need to step through this child */
|
||||
daemon.vpid = child->vpid;
|
||||
ret = &daemon;
|
||||
goto found;
|
||||
}
|
||||
|
||||
startover:
|
||||
/* search routing tree for next step to that daemon */
|
||||
for (item = opal_list_get_first(&my_children);
|
||||
item != opal_list_get_end(&my_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
if (child->vpid == daemon.vpid) {
|
||||
/* the child is hosting the proc - just send it there */
|
||||
ret = &daemon;
|
||||
goto found;
|
||||
}
|
||||
/* otherwise, see if the daemon we need is below the child */
|
||||
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
|
||||
/* yep - we need to step through this child */
|
||||
daemon.vpid = child->vpid;
|
||||
|
||||
/* If the daemon to which we should be routing is dead, then update
|
||||
* the routing tree and start over. */
|
||||
if (!orte_util_proc_is_running(&daemon)) {
|
||||
update_routing_tree(daemon.jobid);
|
||||
goto startover;
|
||||
}
|
||||
|
||||
ret = &daemon;
|
||||
goto found;
|
||||
}
|
||||
}
|
||||
|
||||
@ -425,9 +443,12 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
* any of our children, so we have to step up through our parent
|
||||
*/
|
||||
daemon.vpid = my_parent.vpid;
|
||||
|
||||
ret = &daemon;
|
||||
|
||||
found:
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
|
||||
"%s routed_binomial_get(%s) --> %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -843,17 +864,22 @@ static int set_lifeline(orte_process_name_t *proc)
|
||||
*/
|
||||
local_lifeline.jobid = proc->jobid;
|
||||
local_lifeline.vpid = proc->vpid;
|
||||
local_lifeline.epoch = proc->epoch;
|
||||
lifeline = &local_lifeline;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int binomial_tree(int rank, int parent, int me, int num_procs,
|
||||
int *nchildren, opal_list_t *childrn, opal_bitmap_t *relatives)
|
||||
int *nchildren, opal_list_t *childrn,
|
||||
opal_bitmap_t *relatives, bool mine, orte_jobid_t jobid)
|
||||
{
|
||||
int i, bitmap, peer, hibit, mask, found;
|
||||
orte_routed_tree_t *child;
|
||||
opal_bitmap_t *relations;
|
||||
orte_process_name_t proc_name;
|
||||
|
||||
proc_name.jobid = jobid;
|
||||
|
||||
/* is this me? */
|
||||
if (me == rank) {
|
||||
@ -868,15 +894,43 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
|
||||
child = OBJ_NEW(orte_routed_tree_t);
|
||||
child->vpid = peer;
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
|
||||
"%s routed:binomial found child %s",
|
||||
"%s routed:binomial %d found child %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
rank,
|
||||
ORTE_VPID_PRINT(child->vpid)));
|
||||
if (NULL != childrn) {
|
||||
|
||||
/* If the process we are looking at next is already dead, then
|
||||
* we inherit its children. Keep up with the process name of
|
||||
* that process so we can check its state.
|
||||
*/
|
||||
proc_name.vpid = peer;
|
||||
proc_name.epoch = orte_util_lookup_epoch(&proc_name);
|
||||
|
||||
if (!orte_util_proc_is_running(&proc_name)
|
||||
&& ORTE_EPOCH_MIN < proc_name.epoch
|
||||
&& ORTE_EPOCH_INVALID != proc_name.epoch) {
|
||||
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
|
||||
"%s routed:binomial child %s is dead",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_VPID_PRINT(child->vpid)));
|
||||
relations = relatives;
|
||||
|
||||
/* Leave mine as it is. If it was true, then we want to
|
||||
* inherit the dead node's children as our own. If it wasn't
|
||||
* then we want its relatives as our own. */
|
||||
binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, mine, jobid);
|
||||
|
||||
/* If we use the proc_is_running as a way of determining whether the
|
||||
* process is dead, then we get screwed up on startup. By also
|
||||
* testing the epoch, we make sure that the process really did
|
||||
* start up and then died. */
|
||||
} else if (mine) {
|
||||
/* this is a direct child - add it to my list */
|
||||
opal_list_append(childrn, &child->super);
|
||||
(*nchildren)++;
|
||||
/* setup the relatives bitmap */
|
||||
opal_bitmap_init(&child->relatives, num_procs);
|
||||
|
||||
/* point to the relatives */
|
||||
relations = &child->relatives;
|
||||
} else {
|
||||
@ -886,7 +940,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
|
||||
relations = relatives;
|
||||
}
|
||||
/* search for this child's relatives */
|
||||
binomial_tree(0, 0, peer, num_procs, NULL, NULL, relations);
|
||||
binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false, jobid);
|
||||
}
|
||||
}
|
||||
return parent;
|
||||
@ -902,7 +956,13 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
|
||||
peer = rank | mask;
|
||||
if (peer < num_procs) {
|
||||
/* execute compute on this child */
|
||||
if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives))) {
|
||||
if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine, jobid))) {
|
||||
proc_name.vpid = found;
|
||||
|
||||
if (!orte_util_proc_is_running(&proc_name) && ORTE_EPOCH_MIN < orte_util_lookup_epoch(&proc_name)) {
|
||||
return parent;
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
}
|
||||
@ -910,7 +970,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int update_routing_tree(void)
|
||||
static int update_routing_tree(orte_jobid_t jobid)
|
||||
{
|
||||
orte_routed_tree_t *child;
|
||||
int j;
|
||||
@ -933,8 +993,9 @@ static int update_routing_tree(void)
|
||||
* lie underneath their branch
|
||||
*/
|
||||
my_parent.vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
|
||||
orte_process_info.num_procs,
|
||||
&num_children, &my_children, NULL);
|
||||
orte_process_info.max_procs,
|
||||
&num_children, &my_children, NULL, true, jobid);
|
||||
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_routed_base_output)) {
|
||||
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), my_parent.vpid, num_children);
|
||||
@ -943,7 +1004,7 @@ static int update_routing_tree(void)
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_routed_tree_t*)item;
|
||||
opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid);
|
||||
for (j=0; j < (int)orte_process_info.num_procs; j++) {
|
||||
for (j=0; j < (int)orte_process_info.max_procs; j++) {
|
||||
if (opal_bitmap_is_set_bit(&child->relatives, j)) {
|
||||
opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
|
||||
}
|
||||
|
@ -4,6 +4,9 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -47,7 +50,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
|
||||
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static bool route_is_defined(const orte_process_name_t *target);
|
||||
static int update_routing_tree(void);
|
||||
static int update_routing_tree(orte_jobid_t jobid);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
@ -135,7 +138,8 @@ static int delete_route(orte_process_name_t *proc)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (proc->jobid == ORTE_JOBID_INVALID ||
|
||||
proc->vpid == ORTE_VPID_INVALID) {
|
||||
proc->vpid == ORTE_VPID_INVALID ||
|
||||
proc->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -195,7 +199,8 @@ static int update_route(orte_process_name_t *target,
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -252,6 +257,8 @@ static int update_route(orte_process_name_t *target,
|
||||
ORTE_NAME_PRINT(route)));
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -265,6 +272,8 @@ static int update_route(orte_process_name_t *target,
|
||||
jfam->job_family = jfamily;
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
|
||||
|
||||
opal_pointer_array_add(&orte_routed_jobfams, jfam);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -287,7 +296,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
ret = ORTE_NAME_INVALID;
|
||||
goto found;
|
||||
}
|
||||
@ -354,6 +364,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
goto found;
|
||||
}
|
||||
|
||||
/* Initialize daemon's epoch, based on its current vpid/jobid */
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
|
||||
/* if the daemon is me, then send direct to the target! */
|
||||
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
|
||||
ret = target;
|
||||
@ -798,12 +811,14 @@ static int set_lifeline(orte_process_name_t *proc)
|
||||
*/
|
||||
local_lifeline.jobid = proc->jobid;
|
||||
local_lifeline.vpid = proc->vpid;
|
||||
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
|
||||
|
||||
lifeline = &local_lifeline;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_routing_tree(void)
|
||||
static int update_routing_tree(orte_jobid_t jobid)
|
||||
{
|
||||
/* nothing to do here */
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -36,7 +39,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
|
||||
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static bool route_is_defined(const orte_process_name_t *target);
|
||||
static int update_routing_tree(void);
|
||||
static int update_routing_tree(orte_jobid_t jobid);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
@ -131,7 +134,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
orte_process_name_t *ret;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
ret = ORTE_NAME_INVALID;
|
||||
} else {
|
||||
/* all routes are direct */
|
||||
@ -305,7 +309,7 @@ static int set_lifeline(orte_process_name_t *proc)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_routing_tree(void)
|
||||
static int update_routing_tree(orte_jobid_t jobid)
|
||||
{
|
||||
/* nothing to do here */
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -43,7 +46,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
|
||||
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static bool route_is_defined(const orte_process_name_t *target);
|
||||
static int update_routing_tree(void);
|
||||
static int update_routing_tree(orte_jobid_t jobid);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
@ -126,7 +129,8 @@ static int delete_route(orte_process_name_t *proc)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (proc->jobid == ORTE_JOBID_INVALID ||
|
||||
proc->vpid == ORTE_VPID_INVALID) {
|
||||
proc->vpid == ORTE_VPID_INVALID ||
|
||||
proc->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -194,7 +198,8 @@ static int update_route(orte_process_name_t *target,
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -252,6 +257,7 @@ static int update_route(orte_process_name_t *target,
|
||||
ORTE_NAME_PRINT(route)));
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = route->epoch;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -265,6 +271,7 @@ static int update_route(orte_process_name_t *target,
|
||||
jfam->job_family = jfamily;
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = route->epoch;
|
||||
opal_pointer_array_add(&orte_routed_jobfams, jfam);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -338,14 +345,14 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
|
||||
/* if we are not using static ports and this is going to the HNP, send direct */
|
||||
if (!orte_static_ports &&
|
||||
ORTE_PROC_MY_HNP->jobid == target->jobid &&
|
||||
ORTE_PROC_MY_HNP->vpid == target->vpid) {
|
||||
/* if we are not using static ports and this is going to the HNP, send directly through my parent */
|
||||
if( !orte_static_ports &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing not enabled - going direct",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ret = target;
|
||||
"%s routing to the HNP through my parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
|
||||
ret = ORTE_PROC_MY_PARENT;
|
||||
goto found;
|
||||
}
|
||||
|
||||
@ -357,6 +364,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
goto found;
|
||||
}
|
||||
|
||||
/* Initialize daemon's epoch, based on its current vpid/jobid */
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
|
||||
/* if the daemon is me, then send direct to the target! */
|
||||
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
|
||||
ret = target;
|
||||
@ -376,6 +386,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
/* we are at end of chain - wrap around */
|
||||
daemon.vpid = 0;
|
||||
}
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
ret = &daemon;
|
||||
}
|
||||
}
|
||||
@ -715,12 +726,13 @@ static int set_lifeline(orte_process_name_t *proc)
|
||||
*/
|
||||
local_lifeline.jobid = proc->jobid;
|
||||
local_lifeline.vpid = proc->vpid;
|
||||
local_lifeline.epoch = proc->epoch;
|
||||
lifeline = &local_lifeline;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_routing_tree(void)
|
||||
static int update_routing_tree(orte_jobid_t jobid)
|
||||
{
|
||||
/* if I am anything other than a daemon or the HNP, this
|
||||
* is a meaningless command as I am not allowed to route
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -44,7 +47,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
|
||||
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static bool route_is_defined(const orte_process_name_t *target);
|
||||
static int update_routing_tree(void);
|
||||
static int update_routing_tree(orte_jobid_t jobid);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
@ -142,7 +145,8 @@ static int delete_route(orte_process_name_t *proc)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (proc->jobid == ORTE_JOBID_INVALID ||
|
||||
proc->vpid == ORTE_VPID_INVALID) {
|
||||
proc->vpid == ORTE_VPID_INVALID ||
|
||||
proc->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -210,7 +214,8 @@ static int update_route(orte_process_name_t *target,
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
@ -268,6 +273,7 @@ static int update_route(orte_process_name_t *target,
|
||||
ORTE_NAME_PRINT(route)));
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = route->epoch;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -281,6 +287,7 @@ static int update_route(orte_process_name_t *target,
|
||||
jfam->job_family = jfamily;
|
||||
jfam->route.jobid = route->jobid;
|
||||
jfam->route.vpid = route->vpid;
|
||||
jfam->route.epoch = route->epoch;
|
||||
opal_pointer_array_add(&orte_routed_jobfams, jfam);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -303,7 +310,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
uint16_t jfamily;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
ret = ORTE_NAME_INVALID;
|
||||
goto found;
|
||||
}
|
||||
@ -362,14 +370,14 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
|
||||
/* THIS CAME FROM OUR OWN JOB FAMILY... */
|
||||
|
||||
/* if we are not using static ports and this is going to the HNP, send direct */
|
||||
if (!orte_static_ports &&
|
||||
ORTE_PROC_MY_HNP->jobid == target->jobid &&
|
||||
ORTE_PROC_MY_HNP->vpid == target->vpid) {
|
||||
/* if we are not using static ports and this is going to the HNP, send directly through my parent */
|
||||
if( !orte_static_ports &&
|
||||
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
|
||||
"%s routing not enabled - going direct",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ret = target;
|
||||
"%s routing to the HNP through my parent %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
|
||||
ret = ORTE_PROC_MY_PARENT;
|
||||
goto found;
|
||||
}
|
||||
|
||||
@ -400,6 +408,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
|
||||
/* yep - we need to step through this child */
|
||||
daemon.vpid = child->vpid;
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
ret = &daemon;
|
||||
goto found;
|
||||
}
|
||||
@ -410,6 +419,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
* any of our children, so we have to step up through our parent
|
||||
*/
|
||||
daemon.vpid = my_parent.vpid;
|
||||
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
|
||||
|
||||
ret = &daemon;
|
||||
|
||||
found:
|
||||
@ -765,6 +776,7 @@ static int set_lifeline(orte_process_name_t *proc)
|
||||
*/
|
||||
local_lifeline.jobid = proc->jobid;
|
||||
local_lifeline.vpid = proc->vpid;
|
||||
local_lifeline.epoch = proc->epoch;
|
||||
lifeline = &local_lifeline;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -815,7 +827,7 @@ static void radix_tree(int rank, int *num_children,
|
||||
}
|
||||
}
|
||||
|
||||
static int update_routing_tree(void)
|
||||
static int update_routing_tree(orte_jobid_t jobid)
|
||||
{
|
||||
orte_routed_tree_t *child;
|
||||
int j;
|
||||
@ -857,6 +869,7 @@ static int update_routing_tree(void)
|
||||
my_parent.vpid = (Ii-Sum) % NInPrevLevel;
|
||||
my_parent.vpid += (Sum - NInPrevLevel);
|
||||
}
|
||||
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
|
||||
|
||||
/* compute my direct children and the bitmap that shows which vpids
|
||||
* lie underneath their branch
|
||||
|
@ -3,6 +3,9 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -191,10 +194,12 @@ typedef int (*orte_routed_module_get_wireup_info_fn_t)(opal_buffer_t *buf);
|
||||
* of "leaves" for this process and identifies the vpid of the parent
|
||||
* sitting above this process in the tree.
|
||||
*
|
||||
* @param [in] jobid The jobid of the routing tree that needs to be updated.
|
||||
*
|
||||
* @retval ORTE_SUCCESS The operation completed successfully
|
||||
* @retval ORTE_ERROR_xxx The specified error occurred
|
||||
*/
|
||||
typedef int (*orte_routed_module_update_routing_tree_fn_t)(void);
|
||||
typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid);
|
||||
|
||||
/*
|
||||
* Get the routing tree for this process
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -16,6 +19,7 @@
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -37,7 +41,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
|
||||
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
|
||||
static int route_lost(const orte_process_name_t *route);
|
||||
static bool route_is_defined(const orte_process_name_t *target);
|
||||
static int update_routing_tree(void);
|
||||
static int update_routing_tree(orte_jobid_t jobid);
|
||||
static orte_vpid_t get_routing_tree(opal_list_t *children);
|
||||
static int get_wireup_info(opal_buffer_t *buf);
|
||||
static int set_lifeline(orte_process_name_t *proc);
|
||||
@ -129,7 +133,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
|
||||
orte_process_name_t *ret;
|
||||
|
||||
if (target->jobid == ORTE_JOBID_INVALID ||
|
||||
target->vpid == ORTE_VPID_INVALID) {
|
||||
target->vpid == ORTE_VPID_INVALID ||
|
||||
target->epoch == ORTE_EPOCH_INVALID) {
|
||||
ret = ORTE_NAME_INVALID;
|
||||
} else {
|
||||
/* a slave must always route via its parent daemon */
|
||||
@ -251,9 +256,12 @@ static int route_lost(const orte_process_name_t *route)
|
||||
|
||||
static bool route_is_defined(const orte_process_name_t *target)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* only the route to my daemon is defined */
|
||||
if (target->jobid != ORTE_PROC_MY_DAEMON->jobid ||
|
||||
target->vpid != ORTE_PROC_MY_DAEMON->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, target, ORTE_PROC_MY_DAEMON)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -267,12 +275,14 @@ static int set_lifeline(orte_process_name_t *proc)
|
||||
*/
|
||||
local_lifeline.jobid = proc->jobid;
|
||||
local_lifeline.vpid = proc->vpid;
|
||||
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
|
||||
|
||||
lifeline = &local_lifeline;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_routing_tree(void)
|
||||
static int update_routing_tree(orte_jobid_t jobid)
|
||||
{
|
||||
/* this is a meaningless command for a slave as I am not allowed to route */
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -67,6 +70,7 @@ typedef struct {
|
||||
opal_list_item_t super;
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
orte_epoch_t epoch;
|
||||
char *file;
|
||||
int tick;
|
||||
bool check_size;
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
@ -81,6 +81,7 @@ void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *
|
||||
{
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
@ -91,6 +92,7 @@ void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *
|
||||
{
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
|
||||
@ -468,12 +470,15 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_std_cntr_t count = 1;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/*
|
||||
* Do not send to self, as that is silly.
|
||||
*/
|
||||
if (peer->jobid == ORTE_PROC_MY_HNP->jobid &&
|
||||
peer->vpid == ORTE_PROC_MY_HNP->vpid ) {
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
||||
"%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
|
||||
@ -650,6 +655,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
char *global_snapshot_handle = NULL;
|
||||
char *tmp_str = NULL;
|
||||
int seq_num;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/*
|
||||
* Noop if invalid peer, or peer not specified (JJH Double check this)
|
||||
@ -660,11 +666,12 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/*
|
||||
* Do not send to self, as that is silly.
|
||||
*/
|
||||
if (peer->jobid == ORTE_PROC_MY_HNP->jobid &&
|
||||
peer->vpid == ORTE_PROC_MY_HNP->vpid ) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
||||
"%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
@ -427,6 +427,7 @@ int global_coord_start_ckpt(orte_snapc_base_quiesce_t *datum)
|
||||
new_proc = OBJ_NEW(orte_proc_t);
|
||||
new_proc->name.jobid = proc->name.jobid;
|
||||
new_proc->name.vpid = proc->name.vpid;
|
||||
new_proc->name.epoch = proc->name.epoch;
|
||||
new_proc->node = OBJ_NEW(orte_node_t);
|
||||
new_proc->node->name = proc->node->name;
|
||||
opal_list_append(migrating_procs, &new_proc->super);
|
||||
@ -590,6 +591,7 @@ static int global_init_job_structs(void)
|
||||
orte_proc_t **procs = NULL;
|
||||
orte_std_cntr_t i = 0;
|
||||
orte_vpid_t p = 0;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* look up job data object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) {
|
||||
@ -616,9 +618,12 @@ static int global_init_job_structs(void)
|
||||
|
||||
orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid;
|
||||
orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid;
|
||||
orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch;
|
||||
|
||||
if( orted_snapshot->process_name.jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
orted_snapshot->process_name.vpid == ORTE_PROC_MY_NAME->vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, &orted_snapshot->process_name, ORTE_PROC_MY_NAME)) {
|
||||
global_coord_has_local_children = true;
|
||||
}
|
||||
|
||||
@ -631,6 +636,7 @@ static int global_init_job_structs(void)
|
||||
|
||||
app_snapshot->process_name.jobid = procs[p]->name.jobid;
|
||||
app_snapshot->process_name.vpid = procs[p]->name.vpid;
|
||||
app_snapshot->process_name.epoch = procs[p]->name.epoch;
|
||||
|
||||
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
|
||||
}
|
||||
@ -657,6 +663,7 @@ static int global_refresh_job_structs(void)
|
||||
orte_std_cntr_t i = 0;
|
||||
orte_vpid_t p = 0;
|
||||
bool found = false;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* look up job data object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) {
|
||||
@ -793,6 +800,7 @@ static int global_refresh_job_structs(void)
|
||||
|
||||
app_snapshot->process_name.jobid = procs[p]->name.jobid;
|
||||
app_snapshot->process_name.vpid = procs[p]->name.vpid;
|
||||
app_snapshot->process_name.epoch = procs[p]->name.epoch;
|
||||
|
||||
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
|
||||
}
|
||||
@ -808,9 +816,12 @@ static int global_refresh_job_structs(void)
|
||||
|
||||
orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid;
|
||||
orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid;
|
||||
orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch;
|
||||
|
||||
if( orted_snapshot->process_name.jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
orted_snapshot->process_name.vpid == ORTE_PROC_MY_NAME->vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, &orted_snapshot->process_name, ORTE_PROC_MY_NAME)) {
|
||||
global_coord_has_local_children = true;
|
||||
}
|
||||
for(p = 0; p < cur_node->num_procs; ++p) {
|
||||
@ -826,6 +837,7 @@ static int global_refresh_job_structs(void)
|
||||
|
||||
app_snapshot->process_name.jobid = procs[p]->name.jobid;
|
||||
app_snapshot->process_name.vpid = procs[p]->name.vpid;
|
||||
app_snapshot->process_name.epoch = procs[p]->name.epoch;
|
||||
|
||||
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
|
||||
}
|
||||
@ -2375,14 +2387,17 @@ static orte_snapc_full_orted_snapshot_t *find_orted_snapshot(orte_process_name_t
|
||||
|
||||
orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL;
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(&(global_snapshot.local_snapshots));
|
||||
item != opal_list_get_end(&(global_snapshot.local_snapshots));
|
||||
item = opal_list_get_next(item) ) {
|
||||
orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item;
|
||||
|
||||
if( name->jobid == orted_snapshot->process_name.jobid &&
|
||||
name->vpid == orted_snapshot->process_name.vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, name, &orted_snapshot->process_name)) {
|
||||
return orted_snapshot;
|
||||
}
|
||||
}
|
||||
@ -2404,8 +2419,10 @@ static orte_snapc_full_orted_snapshot_t *find_orted_snapshot(orte_process_name_t
|
||||
item = opal_list_get_next(item) ) {
|
||||
orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item;
|
||||
|
||||
if( name->jobid == orted_snapshot->process_name.jobid &&
|
||||
name->vpid == orted_snapshot->process_name.vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, name, &orted_snapshot->process_name)) {
|
||||
return orted_snapshot;
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
@ -2033,6 +2033,7 @@ static int snapc_full_local_get_vpids(void)
|
||||
vpid_snapshot->process_pid = child->pid;
|
||||
vpid_snapshot->super.process_name.jobid = child->name->jobid;
|
||||
vpid_snapshot->super.process_name.vpid = child->name->vpid;
|
||||
vpid_snapshot->super.process_name.epoch = child->name->epoch;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2094,6 +2095,7 @@ static int snapc_full_local_refresh_vpids(void)
|
||||
vpid_snapshot->process_pid = child->pid;
|
||||
vpid_snapshot->super.process_name.jobid = child->name->jobid;
|
||||
vpid_snapshot->super.process_name.vpid = child->name->vpid;
|
||||
vpid_snapshot->super.process_name.epoch = child->name->epoch;
|
||||
/*vpid_snapshot->migrating = true;*/
|
||||
|
||||
opal_list_append(&(local_global_snapshot.local_snapshots), &(vpid_snapshot->super.super));
|
||||
@ -2109,6 +2111,7 @@ static int snapc_full_local_refresh_vpids(void)
|
||||
vpid_snapshot->process_pid = child->pid;
|
||||
vpid_snapshot->super.process_name.jobid = child->name->jobid;
|
||||
vpid_snapshot->super.process_name.vpid = child->name->vpid;
|
||||
vpid_snapshot->super.process_name.epoch = child->name->epoch;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2119,14 +2122,17 @@ static orte_snapc_full_app_snapshot_t *find_vpid_snapshot(orte_process_name_t *n
|
||||
{
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_snapc_full_app_snapshot_t *vpid_snapshot = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(&(local_global_snapshot.local_snapshots));
|
||||
item != opal_list_get_end(&(local_global_snapshot.local_snapshots));
|
||||
item = opal_list_get_next(item) ) {
|
||||
vpid_snapshot = (orte_snapc_full_app_snapshot_t*)item;
|
||||
|
||||
if( name->jobid == vpid_snapshot->super.process_name.jobid &&
|
||||
name->vpid == vpid_snapshot->super.process_name.vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL ==
|
||||
orte_util_compare_name_fields(mask, name, &vpid_snapshot->super.process_name)) {
|
||||
return vpid_snapshot;
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
@ -83,6 +83,7 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
|
||||
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_name.epoch = 0;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
}
|
||||
@ -90,6 +91,7 @@ void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot)
|
||||
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_name.epoch = 0;
|
||||
|
||||
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -59,6 +62,7 @@ void orte_sstore_base_local_snapshot_info_construct(orte_sstore_base_local_snaps
|
||||
{
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
snapshot->crs_comp = NULL;
|
||||
snapshot->compress_comp = NULL;
|
||||
@ -72,6 +76,7 @@ void orte_sstore_base_local_snapshot_info_destruct( orte_sstore_base_local_snaps
|
||||
{
|
||||
snapshot->process_name.jobid = 0;
|
||||
snapshot->process_name.vpid = 0;
|
||||
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
if( NULL != snapshot->crs_comp ) {
|
||||
free(snapshot->crs_comp);
|
||||
@ -632,6 +637,7 @@ int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_in
|
||||
|
||||
vpid_snapshot->process_name.jobid = proc.jobid;
|
||||
vpid_snapshot->process_name.vpid = proc.vpid;
|
||||
vpid_snapshot->process_name.epoch = proc.epoch;
|
||||
}
|
||||
else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) {
|
||||
vpid_snapshot->crs_comp = strdup(value);
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1212,6 +1215,7 @@ static int orte_sstore_central_extract_global_metadata(orte_sstore_central_globa
|
||||
|
||||
vpid_snapshot->process_name.jobid = handle_info->jobid;
|
||||
vpid_snapshot->process_name.vpid = i;
|
||||
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
|
||||
|
||||
vpid_snapshot->crs_comp = NULL;
|
||||
global_snapshot->start_time = NULL;
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -207,6 +210,7 @@ void orte_sstore_central_local_app_snapshot_info_construct(orte_sstore_central_l
|
||||
{
|
||||
info->name.jobid = ORTE_JOBID_INVALID;
|
||||
info->name.vpid = ORTE_VPID_INVALID;
|
||||
info->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
info->local_location = NULL;
|
||||
info->metadata_filename = NULL;
|
||||
@ -218,6 +222,7 @@ void orte_sstore_central_local_app_snapshot_info_destruct( orte_sstore_central_l
|
||||
{
|
||||
info->name.jobid = ORTE_JOBID_INVALID;
|
||||
info->name.vpid = ORTE_VPID_INVALID;
|
||||
info->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
if( NULL != info->local_location ) {
|
||||
free(info->local_location);
|
||||
@ -530,6 +535,7 @@ static int append_new_app_handle_info(orte_sstore_central_local_snapshot_info_t
|
||||
|
||||
app_info->name.jobid = name->jobid;
|
||||
app_info->name.vpid = name->vpid;
|
||||
app_info->name.epoch = name->epoch;
|
||||
|
||||
opal_list_append(handle_info->app_info_handle, &(app_info->super));
|
||||
|
||||
@ -541,14 +547,16 @@ static orte_sstore_central_local_app_snapshot_info_t *find_app_handle_info(orte_
|
||||
{
|
||||
orte_sstore_central_local_app_snapshot_info_t *app_info = NULL;
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(handle_info->app_info_handle);
|
||||
item != opal_list_get_end(handle_info->app_info_handle);
|
||||
item = opal_list_get_next(item) ) {
|
||||
app_info = (orte_sstore_central_local_app_snapshot_info_t*)item;
|
||||
|
||||
if( app_info->name.jobid == name->jobid &&
|
||||
app_info->name.vpid == name->vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &app_info->name, name)) {
|
||||
return app_info;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1214,8 +1217,10 @@ static int process_local_push(orte_process_name_t* peer, opal_buffer_t* buffer,
|
||||
p_set = OBJ_NEW(orte_filem_base_process_set_t);
|
||||
p_set->source.jobid = peer->jobid;
|
||||
p_set->source.vpid = peer->vpid;
|
||||
p_set->source.epoch = peer->epoch;
|
||||
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
|
||||
}
|
||||
|
||||
@ -1700,6 +1705,7 @@ static int orte_sstore_stage_extract_global_metadata(orte_sstore_stage_global_sn
|
||||
|
||||
vpid_snapshot->process_name.jobid = handle_info->jobid;
|
||||
vpid_snapshot->process_name.vpid = i;
|
||||
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
|
||||
|
||||
/* JJH: Currently we do not have this information since we do not save
|
||||
* individual vpid info in the Global SStore. It is in the metadata
|
||||
|
@ -1,6 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -284,6 +287,7 @@ void orte_sstore_stage_local_app_snapshot_info_construct(orte_sstore_stage_local
|
||||
{
|
||||
info->name.jobid = ORTE_JOBID_INVALID;
|
||||
info->name.vpid = ORTE_VPID_INVALID;
|
||||
info->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
info->local_location = NULL;
|
||||
info->compressed_local_location = NULL;
|
||||
@ -298,6 +302,7 @@ void orte_sstore_stage_local_app_snapshot_info_destruct( orte_sstore_stage_local
|
||||
{
|
||||
info->name.jobid = ORTE_JOBID_INVALID;
|
||||
info->name.vpid = ORTE_VPID_INVALID;
|
||||
info->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
if( NULL != info->local_location ) {
|
||||
free(info->local_location);
|
||||
@ -1009,6 +1014,7 @@ static int append_new_app_handle_info(orte_sstore_stage_local_snapshot_info_t *h
|
||||
|
||||
app_info->name.jobid = name->jobid;
|
||||
app_info->name.vpid = name->vpid;
|
||||
app_info->name.epoch = name->epoch;
|
||||
|
||||
opal_list_append(handle_info->app_info_handle, &(app_info->super));
|
||||
|
||||
@ -1020,14 +1026,16 @@ static orte_sstore_stage_local_app_snapshot_info_t *find_app_handle_info(orte_ss
|
||||
{
|
||||
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
|
||||
opal_list_item_t* item = NULL;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
for(item = opal_list_get_first(handle_info->app_info_handle);
|
||||
item != opal_list_get_end(handle_info->app_info_handle);
|
||||
item = opal_list_get_next(item) ) {
|
||||
app_info = (orte_sstore_stage_local_app_snapshot_info_t*)item;
|
||||
|
||||
if( app_info->name.jobid == name->jobid &&
|
||||
app_info->name.vpid == name->vpid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &app_info->name, name)) {
|
||||
return app_info;
|
||||
}
|
||||
}
|
||||
@ -2049,14 +2057,17 @@ static int orte_sstore_stage_local_preload_files(char **local_location, bool *sk
|
||||
/* if I am the HNP, then use me as the source */
|
||||
p_set->source.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
p_set->source.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
p_set->source.epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
}
|
||||
else {
|
||||
/* otherwise, set the HNP as the source */
|
||||
p_set->source.jobid = ORTE_PROC_MY_HNP->jobid;
|
||||
p_set->source.vpid = ORTE_PROC_MY_HNP->vpid;
|
||||
p_set->source.epoch = ORTE_PROC_MY_HNP->epoch;
|
||||
}
|
||||
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
|
||||
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
|
||||
|
||||
/* Define the file set */
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -27,6 +27,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -50,4 +51,7 @@ ORTE_DECLSPEC extern struct timeval orte_daemon_msg_recvd;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
/* Local function */
|
||||
int send_to_local_applications(opal_pointer_array_t *dead_names);
|
||||
|
||||
#endif /* ORTED_H */
|
||||
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
x
Ссылка в новой задаче
Block a user