1
1

Add resilience to ORTE. Allows the runtime to continue after a process (or

ORTED) failure. Note that more work will be necessary to allow the MPI layer to
take advantage of this.

Per RFC:
http://www.open-mpi.org/community/lists/devel/2011/06/9299.php

This commit was SVN r24815.
Этот коммит содержится в:
Wesley Bland 2011-06-23 20:38:02 +00:00
родитель e8817f3f63
Коммит e1ba09ad51
130 изменённых файлов: 2783 добавлений и 709 удалений

Просмотреть файл

@ -85,6 +85,7 @@ tprins Tim Prins IU, LANL
twoodall Tim Woodall LANL
vasily Vasily Filipov Mellanox
vsahay Vishal Sahay IU
wbland Wesley Bland UTK
yuw Weikuan Yu LANL, OSU
Affiliation abbreviations:

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -234,6 +234,12 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
return new_errhandler;
}
/**
 * Runtime errhandler callback, registered with the ORTE errmgr during
 * ompi_mpi_init (via orte_errmgr.set_fault_callback).
 *
 * @param procs Pointer array naming the failed processes.  Currently
 *              unused here: the default MPI-layer policy is simply to
 *              abort the entire job on any runtime-reported fault.
 */
void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs) {
ompi_mpi_abort(MPI_COMM_WORLD, 1, false);
}
/**************************************************************************
*

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -35,6 +35,8 @@
#include "ompi/errhandler/errhandler_predefined.h"
#include "ompi/errhandler/errcode-internal.h"
#include "orte/types.h"
BEGIN_C_DECLS
/*
@ -358,6 +360,19 @@ struct ompi_request_t;
OMPI_DECLSPEC ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
ompi_errhandler_generic_handler_fn_t *func,
ompi_errhandler_lang_t language);
/**
* Callback function from runtime layer to alert the MPI layer of an error at
* the runtime layer.
*
* @param procs The names of the processes that have failed.
*
* This function is used to alert the MPI layer to a specific fault at the
* runtime layer. Currently, the only faults reported using this method are
* process failures. The MPI layer has the option to perform whatever actions it
* needs to stabilize itself and continue running, abort, etc.
*/
OMPI_DECLSPEC void ompi_errhandler_runtime_callback(opal_pointer_array_t *procs);
/**
* Check to see if an errhandler is intrinsic.

Просмотреть файл

@ -660,8 +660,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
bool found = false;
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
"jobid %d, vpid %d, sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid, subnet_id, lid));
"jobid %d, vpid %d, epoch %d, sid %" PRIx64 ", lid %d",
process_name->jobid, process_name->vpid, process_name->epoch, subnet_id, lid));
/* find ibproc */
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
for (ib_proc = (mca_btl_openib_proc_t*)

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -1208,6 +1208,7 @@ mca_coll_sm2_comm_query(struct ompi_communicator_t *comm, int *priority)
peer = OBJ_NEW(orte_namelist_t);
peer->name.jobid = comm->c_local_group->grp_proc_pointers[i]->proc_name.jobid;
peer->name.vpid = comm->c_local_group->grp_proc_pointers[i]->proc_name.vpid;
peer->name.epoch = comm->c_local_group->grp_proc_pointers[i]->proc_name.epoch;
opal_list_append(&peers, &peer->item);
}
/* prepare send data */

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 The University of Tennessee and The University
* Copyright (c) 2010-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
@ -35,6 +35,7 @@
#include "orte/util/name_fns.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/ess/ess.h"
#include "ompi/request/request.h"
#include "ompi/mca/dpm/dpm.h"
@ -701,6 +702,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
@ -728,6 +730,7 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
peer_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
HOKE_TRAFFIC_MSG_REF_RETURN(item);
@ -837,6 +840,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -864,6 +868,7 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->matched = INVALID_INT;
msg_ref->done = INVALID_INT;
@ -897,6 +902,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -928,6 +934,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
msg_ref->proc_name.epoch = ORTE_EPOCH_INVALID;
msg_ref->done = INVALID_INT;
msg_ref->active = INVALID_INT;
@ -947,6 +954,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
}
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
@ -954,6 +962,7 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
msg_ack_ref->peer.epoch = ORTE_EPOCH_INVALID;
}
@ -1006,7 +1015,7 @@ do { \
}
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
#define CREATE_NEW_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
{ \
HOKE_TRAFFIC_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1025,6 +1034,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
\
msg_ref->matched = 0; \
msg_ref->done = 0; \
@ -1033,7 +1043,7 @@ do { \
msg_ref->active_drain = 0; \
}
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid) \
#define CREATE_NEW_DRAIN_MSG(msg_ref, v_type, v_count, v_ddt_size, v_tag, v_rank, v_comm, p_jobid, p_vpid, p_epoch) \
{ \
HOKE_DRAIN_MSG_REF_ALLOC(msg_ref, ret); \
\
@ -1053,6 +1063,7 @@ do { \
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
msg_ref->proc_name.epoch = p_epoch; \
}
@ -1455,6 +1466,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
new_peer_ref->proc_name.epoch = procs[i]->proc_name.epoch;
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
}
@ -3225,7 +3237,8 @@ static int traffic_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
} else {
CREATE_NEW_MSG((*msg_ref), msg_type,
count, ddt_size, tag, dest, comm,
@ -3364,6 +3377,7 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
(*new_msg_ref)->proc_name.epoch = to_peer_ref->proc_name.epoch;
}
return exit_status;
@ -3794,7 +3808,8 @@ static int drain_message_append(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
CREATE_NEW_DRAIN_MSG((*msg_ref), msg_type,
count, NULL, tag, dest, comm,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
(*msg_ref)->done = 0;
(*msg_ref)->active = 0;
@ -4142,6 +4157,7 @@ static int drain_message_copy_remove(ompi_crcp_bkmrk_pml_drain_message_ref_t *dr
static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
{
opal_list_item_t* item = NULL;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(&ompi_crcp_bkmrk_pml_peer_refs);
item != opal_list_get_end(&ompi_crcp_bkmrk_pml_peer_refs);
@ -4149,7 +4165,9 @@ static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(orte_process_name_t proc)
ompi_crcp_bkmrk_pml_peer_ref_t *cur_peer_ref;
cur_peer_ref = (ompi_crcp_bkmrk_pml_peer_ref_t*)item;
if( OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
if( OPAL_EQUAL == orte_util_compare_name_fields(mask,
&(cur_peer_ref->proc_name),
&proc) ) {
return cur_peer_ref;
@ -5266,6 +5284,7 @@ static int send_bookmarks(int peer_idx)
*/
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
if( NULL == (peer_ref = find_peer(peer_name))) {
opal_output(mca_crcp_bkmrk_component.super.output_handle,
@ -5326,6 +5345,7 @@ static int recv_bookmarks(int peer_idx)
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
peer_name.vpid = peer_idx;
peer_name.epoch = orte_ess.proc_get_epoch(&peer_name);
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
OMPI_CRCP_COORD_BOOKMARK_TAG,
@ -5507,6 +5527,7 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
d_msg_ack->peer.epoch = peer_ref->proc_name.epoch;
d_msg_ack->complete = false;
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
OPAL_OUTPUT_VERBOSE((10, mca_crcp_bkmrk_component.super.output_handle,
@ -6146,7 +6167,8 @@ static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t *peer_r
count, datatype_size, tag, rank,
ompi_comm_lookup(comm_id),
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
peer_ref->proc_name.vpid,
peer_ref->proc_name.epoch);
traffic_message_create_drain_message(true, num_left_unresolved,
peer_ref,

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -1130,6 +1130,7 @@ static void process_cb(int fd, short event, void *data)
/* flag the identity of the remote proc */
carport.jobid = mev->sender.jobid;
carport.vpid = mev->sender.vpid;
carport.epoch = mev->sender.epoch;
/* release the event */
OBJ_RELEASE(mev);

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -395,12 +398,13 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
orte_proc.epoch = hdr->hdr_restart.hdr_epoch;
ompi_proc = ompi_proc_find(&orte_proc);
opal_output_verbose(20, mca_pml_bfo_output,
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
"hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s",
"hdr->hdr_vpid=%d, hdr->hdr_epoch=%d, ompi_proc->proc_hostname=%s",
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
@ -408,8 +412,8 @@ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl
hdr->hdr_restart.hdr_restartseq,
recvreq->remote_req_send.pval, (void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
ompi_proc->proc_hostname);
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
hdr->hdr_restart.hdr_epoch, ompi_proc->proc_hostname);
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
return;
}
@ -711,6 +715,7 @@ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* send
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
restart->hdr_epoch = ORTE_PROC_MY_NAME->epoch;
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -415,6 +415,7 @@ struct mca_pml_bfo_restart_hdr_t {
int32_t hdr_dst_rank; /**< needed to send NACK */
uint32_t hdr_jobid; /**< needed to send NACK */
uint32_t hdr_vpid; /**< needed to send NACK */
uint32_t hdr_epoch; /**< needed to send NACK */
};
typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
@ -427,6 +428,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \
(h).hdr_jobid = ntohl((h).hdr_jobid); \
(h).hdr_vpid = ntohl((h).hdr_vpid); \
(h).hdr_epoch = ntohl((h).hdr_epoch); \
} while (0)
#define MCA_PML_BFO_RESTART_HDR_HTON(h) \
@ -435,6 +437,7 @@ typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
(h).hdr_dst_rank = htonl((h).hdr_dst_rank); \
(h).hdr_jobid = htonl((h).hdr_jobid); \
(h).hdr_vpid = htonl((h).hdr_vpid); \
(h).hdr_epoch = htonl((h).hdr_epoch); \
} while (0)
#endif /* PML_BFO */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -108,6 +108,7 @@ int ompi_proc_init(void)
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.vpid = i;
proc->proc_name.epoch = ORTE_EPOCH_MIN;
if (i == ORTE_PROC_MY_NAME->vpid) {
ompi_proc_local_proc = proc;
proc->proc_flags = OPAL_PROC_ALL_LOCAL;
@ -361,6 +362,8 @@ int ompi_proc_refresh(void) {
/* Does not change: proc->proc_name.vpid */
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->proc_name.epoch = orte_ess.proc_get_epoch(&proc->proc_name);
/* Make sure to clear the local flag before we set it below */
proc->proc_flags = 0;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -67,6 +67,7 @@
#include "ompi/communicator/communicator.h"
#include "ompi/info/info.h"
#include "ompi/errhandler/errcode.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/request/request.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
@ -369,6 +370,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
gettimeofday(&ompistart, NULL);
}
/* Register errhandler callback with orte errmgr */
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
/* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow
MPI_THREAD_MULTIPLE. Set this stuff up here early in the

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -81,28 +81,36 @@ typedef uint32_t orte_vpid_t;
#define ORTE_VPID_T OPAL_UINT32
#define ORTE_VPID_MAX UINT32_MAX-2
#define ORTE_VPID_MIN 0
typedef uint32_t orte_epoch_t;
#define ORTE_EPOCH_T OPAL_UINT32
#define ORTE_EPOCH_MAX UINT32_MAX-2
#define ORTE_EPOCH_MIN 0
#define ORTE_PROCESS_NAME_HTON(n) \
do { \
n.jobid = htonl(n.jobid); \
n.vpid = htonl(n.vpid); \
n.epoch = htonl(n.epoch); \
} while (0)
#define ORTE_PROCESS_NAME_NTOH(n) \
do { \
n.jobid = ntohl(n.jobid); \
n.vpid = ntohl(n.vpid); \
n.epoch = ntohl(n.epoch); \
} while (0)
#define ORTE_NAME_ARGS(n) \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid)
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid), \
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch)
/*
* define invalid values
*/
#define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2)
#define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2)
#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2)
#define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF)
/*
@ -110,6 +118,7 @@ do { \
*/
#define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1)
#define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1)
#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1)
#define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF)
/*
@ -118,6 +127,14 @@ do { \
struct orte_process_name_t {
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process id - equivalent to rank */
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
* The epoch will start at ORTE_EPOCH_MIN and
* increment every time the process is detected as
* having stopped (including normal shutdown). The
* HNP will be responsible for informing all
* processes that did not directly detect the
* failure to increment their epochs.
*/
};
typedef struct orte_process_name_t orte_process_name_t;
@ -140,35 +157,35 @@ typedef void* orte_iov_base_ptr_t;
#define ORTE_NAME (OPAL_DSS_ID_DYNAMIC + 2) /**< an orte_process_name_t */
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
#if !ORTE_DISABLE_FULL_SUPPORT
/* State-related types */
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 5) /**< node status flag */
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< process/resource status */
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< job status flag */
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 8) /**< process exit code */
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */
#define ORTE_PROC_STATE (OPAL_DSS_ID_DYNAMIC + 7) /**< process/resource status */
#define ORTE_JOB_STATE (OPAL_DSS_ID_DYNAMIC + 8) /**< job status flag */
#define ORTE_EXIT_CODE (OPAL_DSS_ID_DYNAMIC + 9) /**< process exit code */
/* Data-passing types */
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 9) /**< registry return value */
#define ORTE_VALUE (OPAL_DSS_ID_DYNAMIC + 10) /**< registry return value */
/* Resource types */
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 10) /**< argv and enviro arrays */
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 11) /**< describes capabilities of nodes */
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes slot allocations/reservations */
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 13) /**< job information */
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 14) /**< node information */
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 15) /**< process information */
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 16) /**< map of process locations */
#define ORTE_APP_CONTEXT (OPAL_DSS_ID_DYNAMIC + 11) /**< argv and enviro arrays */
#define ORTE_NODE_DESC (OPAL_DSS_ID_DYNAMIC + 12) /**< describes capabilities of nodes */
#define ORTE_SLOT_DESC (OPAL_DSS_ID_DYNAMIC + 13) /**< describes slot allocations/reservations */
#define ORTE_JOB (OPAL_DSS_ID_DYNAMIC + 14) /**< job information */
#define ORTE_NODE (OPAL_DSS_ID_DYNAMIC + 15) /**< node information */
#define ORTE_PROC (OPAL_DSS_ID_DYNAMIC + 16) /**< process information */
#define ORTE_JOB_MAP (OPAL_DSS_ID_DYNAMIC + 17) /**< map of process locations */
/* RML types */
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 17) /**< tag for sending/receiving messages */
#define ORTE_RML_TAG (OPAL_DSS_ID_DYNAMIC + 18) /**< tag for sending/receiving messages */
/* DAEMON command type */
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 18) /**< command flag for communicating with the daemon */
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */
/* GRPCOMM types */
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 19)
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20)
/* IOF types */
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20)
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21)
/* provide a boundary for others to use */

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -383,6 +386,7 @@ static void recv_cmd(int status,
dat = OBJ_NEW(orte_db_data_t);
dat->name.jobid = sender->jobid;
dat->name.vpid = sender->vpid;
dat->name.epoch= sender->epoch;
dat->key = key;
count=1;
opal_dss.unpack(buf, &dat->size, &count, OPAL_INT32);

Просмотреть файл

@ -1,9 +1,13 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,11 +26,15 @@
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -48,9 +56,22 @@ static int update_state(orte_jobid_t job,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* HNP module
******************/
@ -64,7 +85,12 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
NULL,
orte_errmgr_base_set_fault_callback,
NULL
};
/************************
@ -87,6 +113,8 @@ static int update_state(orte_jobid_t job,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app: job %s reported state %s"
" for proc %s state %s exit_code %d",
@ -104,9 +132,9 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (ORTE_PROC_MY_NAME->jobid == proc->vpid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
return ORTE_SUCCESS;
}
@ -120,6 +148,95 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
/*
 * Post-startup hook for the app errmgr module: register a persistent,
 * non-blocking RML receive on the epoch-change tag so that failure
 * notifications from our daemon reach epoch_change_recv().
 *
 * Returns the status of the recv registration.
 */
static int post_startup(void) {
    return orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
                                   ORTE_RML_TAG_EPOCH_CHANGE,
                                   ORTE_RML_PERSISTENT,
                                   epoch_change_recv,
                                   NULL);
}
/*
 * Pre-shutdown hook for the app errmgr module: cancel the persistent
 * epoch-change receive installed by post_startup().
 *
 * Returns the status of the cancel operation.
 */
static int pre_shutdown(void) {
    return orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
                                ORTE_RML_TAG_EPOCH_CHANGE);
}
/*
 * RML receive callback for ORTE_RML_TAG_EPOCH_CHANGE messages.  Defers
 * the actual processing to epoch_change() via the ORTE message-event
 * machinery so the work runs from the event loop rather than inside the
 * receive path.
 */
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
void epoch_change(int fd,
short event,
void *data) {
orte_message_event_t *mev = (orte_message_event_t *) data;
opal_buffer_t *buffer = mev->buffer;
orte_process_name_t *proc;
int n = 1, ret, num_dead, i;
opal_pointer_array_t *procs;
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Received epoch change notification",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
procs = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
for (i = 0; i < num_dead; i++) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc[i].epoch++;
orte_util_set_epoch(&proc[i], proc[i].epoch);
opal_pointer_array_add(procs, &proc[i]);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Epoch for %s updated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc[i])));
}
if (NULL != fault_cbfunc && 0 < num_dead) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
(*fault_cbfunc)(procs);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
free(proc);
OBJ_RELEASE(procs);
}
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
int ret, exit_status = ORTE_SUCCESS;
@ -161,7 +278,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
goto cleanup;
}
cleanup:
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -105,6 +105,7 @@ ORTE_DECLSPEC void orte_errmgr_base_proc_state_notify(orte_proc_state_t state, o
/*
* Additional External API function declared in errmgr.h
*/
ORTE_DECLSPEC orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc);
END_C_DECLS

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -97,11 +97,13 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
}
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
}
@ -137,11 +139,13 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
item->node_name = NULL;
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
item->map_node_name = NULL;
@ -152,6 +156,7 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
{
item->proc_name.vpid = ORTE_VPID_INVALID;
item->proc_name.epoch = ORTE_EPOCH_INVALID;
item->proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->node_name ) {
@ -160,6 +165,7 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
}
item->map_proc_name.vpid = ORTE_VPID_INVALID;
item->map_proc_name.epoch = ORTE_EPOCH_INVALID;
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
if( NULL != item->map_node_name ) {
@ -678,6 +684,18 @@ int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_
#endif
/*
 * Install a new fault callback and hand back the one it replaces, so the
 * caller can restore the previous handler later if desired.
 */
orte_errmgr_fault_callback_t *orte_errmgr_base_set_fault_callback(orte_errmgr_fault_callback_t *cbfunc) {
    orte_errmgr_fault_callback_t *prev_cbfunc;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s errmgr:base Called set_fault_callback",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Swap in the new callback, remembering the old one for the caller. */
    prev_cbfunc = fault_cbfunc;
    fault_cbfunc = cbfunc;

    return prev_cbfunc;
}
/********************
* Local Functions
********************/

Просмотреть файл

@ -55,6 +55,8 @@ orte_errmgr_base_t orte_errmgr_base;
orte_errmgr_base_component_t orte_errmgr_base_selected_component;
orte_errmgr_fault_callback_t *fault_cbfunc;
/* Public module provides a wrapper around previous functions */
orte_errmgr_base_module_t orte_errmgr = {
NULL, /* init */

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* All rights reserved.
*
* $COPYRIGHT$
@ -264,6 +267,7 @@ static int errmgr_base_tool_start_cmdline_listener(void)
*/
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
errmgr_cmdline_sender.epoch = ORTE_EPOCH_INVALID;
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_MIGRATE,
0,
@ -375,12 +379,14 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
swap_dest.jobid = errmgr_cmdline_sender.jobid;
swap_dest.vpid = errmgr_cmdline_sender.vpid;
swap_dest.epoch = errmgr_cmdline_sender.epoch;
errmgr_cmdline_sender = *sender;
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
errmgr_cmdline_sender.jobid = swap_dest.jobid;
errmgr_cmdline_sender.vpid = swap_dest.vpid;
errmgr_cmdline_sender.epoch = swap_dest.epoch;
goto cleanup;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -49,6 +49,7 @@
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "opal/util/opal_sos.h"
@ -90,6 +91,22 @@ struct orte_errmgr_predicted_node_t {
typedef struct orte_errmgr_predicted_node_t orte_errmgr_predicted_node_t;
OBJ_CLASS_DECLARATION(orte_errmgr_predicted_node_t);
/*
* Callback function that should be called when there is a fault.
*
* This callback function will be used anytime (other than during finalize) the
* runtime detects and handles a process failure. The runtime will complete all
* its stabilization before alerting the callback function. The parameter to the
* callback function is a pointer array holding the orte_process_name_t entries
* of the processes that failed. It will not alert the application to failures
* that are not in the same job as the alerted process, only failures within
* the same jobid.
*
* @param[in] procs Pointer array of the names of the processes that failed
*/
typedef void (orte_errmgr_fault_callback_t)(opal_pointer_array_t *procs);
ORTE_DECLSPEC extern orte_errmgr_fault_callback_t *fault_cbfunc;
/*
* Structure to describe a suggested remapping element for a predicted fault.
*
@ -242,42 +259,100 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
/**
* Register a callback to alert caller when ORTE is preparing to
* migrate the process to another location. This provides an
* opportunity for the process to checkpoint any required state,
* and to cleanly shutdown.
* Function to perform actions that require the rest of the ORTE layer to be up
* and running.
*
* @param[in] delay Time to delay before assuming process is stuck
* and cannot exit on its own - and thus, go
* ahead and migrate it
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR An unspecified error occured
*/
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
/*
* This function gets called just after startup is finished. It gives the errmgr
* a chance to setup anything that requires ORTE to actually be ready to go such
* as registering callbacks, posting receives, etc.
*/
typedef int (*orte_errmgr_base_module_post_startup_t)(void);
/*
* This function gets called just before shutdown begins. It gives the errmgr a
* chance to clean up anything that it did after startup, i.e. deregistering
* callbacks, cleaning up receives, etc.
*/
typedef int (*orte_errmgr_base_module_pre_shutdown_t)(void);
/**
* Function to mark a list of processes as dead and perform any internal cleanup
* necessary.
*
* @param[in] dead_procs Process list that is being marked as dead.
*
* @retval ORTE_SUCCESS The operation completed successfully.
* @retval ORTE_ERROR An unspecified error occurred.
*/
typedef int (*orte_errmgr_base_module_mark_processes_as_dead_t)(opal_pointer_array_t *dead_procs);
/**
* Set the callback function for faults.
*
* @param[in] cbfunc The callback function.
*
* @retval The previous fault callback function.
*/
typedef orte_errmgr_fault_callback_t *(*orte_errmgr_base_module_set_fault_callback_t)(orte_errmgr_fault_callback_t *cbfunc);
/**
* Receive updates about failure notifications.
*
* @param[in] sender The process who originally sent the failure notification.
* @param[in] buffer The buffer containing all the information about the failed process.
*
* @retval ORTE_SUCCESS The operation completed successfully.
* @retval ORTE_ERROR An unspecified error occurred.
*/
typedef int (*orte_errmgr_base_module_failure_notification_t)(orte_process_name_t *sender,
opal_buffer_t *buffer);
/*
* Module Structure
*/
struct orte_errmgr_base_module_2_3_0_t {
/** Initialization Function */
orte_errmgr_base_module_init_fn_t init;
orte_errmgr_base_module_init_fn_t init;
/** Finalization Function */
orte_errmgr_base_module_finalize_fn_t finalize;
orte_errmgr_base_module_finalize_fn_t finalize;
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
/** Actual process failure notification */
orte_errmgr_base_module_update_state_fn_t update_state;
orte_errmgr_base_module_update_state_fn_t update_state;
/** Predicted process/node failure notification */
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
orte_errmgr_base_module_predicted_fault_fn_t predicted_fault;
/** Suggest a node to map a restarting process onto */
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
orte_errmgr_base_module_suggest_map_targets_fn_t suggest_map_targets;
/** Handle any FT Notifications */
orte_errmgr_base_module_ft_event_fn_t ft_event;
orte_errmgr_base_module_ft_event_fn_t ft_event;
/* Register to be warned of impending migration */
/* Register to be warned of impending migration */
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
/** Perform post-statup operations */
orte_errmgr_base_module_post_startup_t post_startup;
/** Perform pre-shutdown operations */
orte_errmgr_base_module_pre_shutdown_t pre_shutdown;
/* Mark a process as dead. */
orte_errmgr_base_module_mark_processes_as_dead_t mark_processes_as_dead;
/* Set the callback function */
orte_errmgr_base_module_set_fault_callback_t set_fault_callback;
/* Receive failure notification */
orte_errmgr_base_module_failure_notification_t failure_notification;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -57,10 +60,6 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
@ -81,6 +80,10 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_global_ft_event(int state);
int orte_errmgr_hnp_global_post_startup(void);
int orte_errmgr_hnp_global_pre_shutdown(void);
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
/* HNP Versions */
int orte_errmgr_hnp_base_global_init(void);

Просмотреть файл

@ -2,6 +2,9 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -391,6 +394,7 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *node = NULL;
bool found = false;
int num_removed = 0, num_to_remove;
orte_ns_cmp_bitmask_t mask;
if( NULL == current_global_jobdata ) {
return ORTE_SUCCESS;
@ -410,8 +414,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
item = opal_list_get_next(item) ) {
wp_item = (errmgr_autor_wp_item_t*)item;
if( wp_item->name.vpid == proc->name.vpid &&
wp_item->name.jobid == proc->name.jobid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) {
found = true;
break;
}
@ -518,6 +522,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
wp_item->name.jobid = proc->jobid;
wp_item->name.vpid = proc->vpid;
wp_item->name.epoch = proc->epoch;
wp_item->state = state;
opal_list_append(procs_pending_recovery, &(wp_item->super));
@ -621,6 +626,7 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
wp->state = 0;
}
@ -629,6 +635,7 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
wp->state = 0;
}

Просмотреть файл

@ -2,6 +2,9 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -747,6 +750,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -803,6 +807,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -850,6 +855,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}

Просмотреть файл

@ -3,6 +3,9 @@
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -29,9 +32,11 @@
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
@ -53,8 +58,9 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static int record_dead_process(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
/*
* Module functions: Global
@ -79,7 +85,11 @@ static int suggest_map_targets(orte_proc_t *proc,
static int ft_event(int state);
static int post_startup(void);
static int pre_shutdown(void);
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
/******************
* ORTED module
@ -94,7 +104,12 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
predicted_fault,
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
mark_processes_as_dead,
orte_errmgr_base_set_fault_callback, /* Set callback function */
failure_notification
};
/************************
@ -125,20 +140,29 @@ static int update_state(orte_jobid_t job,
int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
orte_app_context_t *app;
orte_ns_cmp_bitmask_t mask;
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:orted:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((NULL == proc) ? "App. Process" :
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
/* if this is a heartbeat failure, let the HNP handle it */
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
return ORTE_SUCCESS;
}
/*** UPDATE COMMAND FOR A JOB ***/
if (NULL == proc) {
/* this is an update for an entire job */
@ -175,7 +199,7 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == job) {
break;
@ -184,7 +208,7 @@ static int update_state(orte_jobid_t job,
if (NULL == jobdat) {
return ORTE_ERR_NOT_FOUND;
}
switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jobdat, exit_code);
@ -197,10 +221,10 @@ static int update_state(orte_jobid_t job,
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
break;
@ -237,15 +261,16 @@ static int update_state(orte_jobid_t job,
* lifeline
*/
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
return ORTE_SUCCESS;
}
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
/* terminate - our routed children will see
* us leave and automatically die
*/
@ -256,21 +281,25 @@ static int update_state(orte_jobid_t job,
/* was it a daemon that failed? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
/* if all my routes are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes() &&
0 == opal_list_get_size(&orte_local_children)) {
orte_quit();
}
}
record_dead_process(proc);
/* if not, then indicate we can continue */
return ORTE_SUCCESS;
}
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
@ -280,7 +309,7 @@ static int update_state(orte_jobid_t job,
/* must already be complete */
return ORTE_SUCCESS;
}
/* if there are no local procs for this job, we can
* ignore this call
*/
@ -301,15 +330,15 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid);
killprocs(proc->jobid, proc->vpid, proc->epoch);
}
app = jobdat->apps[child->app_idx];
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
@ -324,7 +353,7 @@ static int update_state(orte_jobid_t job,
}
}
}
if (ORTE_PROC_STATE_TERMINATED < state) {
if( jobdat->enable_recovery ) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -335,8 +364,8 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
/* see if this child has reached its local restart limit */
app = jobdat->apps[child->app_idx];
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -363,8 +392,8 @@ static int update_state(orte_jobid_t job,
}
}
}
REPORT_ABORT:
REPORT_ABORT:
/* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away
*/
@ -387,8 +416,8 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
@ -402,7 +431,7 @@ static int update_state(orte_jobid_t job,
opal_list_remove_item(&orte_local_children, &child->super);
/* Decrement the number of local procs */
jobdat->num_local_procs--;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -424,15 +453,15 @@ static int update_state(orte_jobid_t job,
OBJ_DESTRUCT(&alert);
return rc;
}
/* find this proc in the local children so we can update its state */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
if (0 < pid) {
@ -452,7 +481,7 @@ static int update_state(orte_jobid_t job,
* the HNP so it is available to debuggers and anyone
* else that needs it
*/
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted: sending contact info to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -469,7 +498,7 @@ static int update_state(orte_jobid_t job,
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack all the local child vpids */
/* pack all the local child vpids and epochs */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -479,6 +508,10 @@ static int update_state(orte_jobid_t job,
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->epoch, 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
}
}
/* pack an invalid marker */
@ -502,7 +535,7 @@ static int update_state(orte_jobid_t job,
}
return rc;
}
/* only other state is terminated - see if anyone is left alive */
if (!any_live_children(proc->jobid)) {
/* lookup the local jobdat for this job */
@ -511,7 +544,7 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
@ -533,8 +566,8 @@ static int update_state(orte_jobid_t job,
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) {
ORTE_ERROR_LOG(rc);
}
FINAL_CLEANUP:
FINAL_CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -548,7 +581,7 @@ static int update_state(orte_jobid_t job,
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
if (jobdat->jobid == child->name->jobid) {
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
@ -557,11 +590,11 @@ static int update_state(orte_jobid_t job,
/* ensure the job's local session directory tree is removed */
orte_session_dir_cleanup(jobdat->jobid);
/* remove this job from our local job data since it is complete */
opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
OBJ_RELEASE(jobdat);
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
@ -569,6 +602,7 @@ static int update_state(orte_jobid_t job,
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
/* indicate that the job is complete */
return rc;
}
@ -594,6 +628,131 @@ int ft_event(int state)
return ORTE_SUCCESS;
}
/* Hook invoked once the ORTE layer is fully up; the ORTED errmgr currently
 * needs no post-startup work (no receives or callbacks to register here). */
int post_startup(void) {
    return ORTE_SUCCESS;
}
/* Hook invoked just before ORTE shutdown begins; nothing was set up in
 * post_startup(), so there is nothing to tear down here. */
int pre_shutdown(void) {
    return ORTE_SUCCESS;
}
/*
 * Record a set of processes as dead on this daemon: bump each one's epoch,
 * drop it from our list of local children (if it was ours), remove its route,
 * rebuild the routing tree, and finally notify any registered fault callback.
 *
 * @param[in] dead_procs  Pointer array of orte_process_name_t entries to mark
 *                        as dead; entries whose epoch is older than our local
 *                        record are treated as stale and skipped.
 *
 * @retval ORTE_SUCCESS   Always (stale/NULL entries are skipped, not errors).
 */
int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
    int i;
    orte_process_name_t *name_item;
    opal_list_item_t *item;
    orte_odls_child_t *child;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                "ORTED %s marking procs as dead",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) {
        /* The array may be sparse; skip (but report) NULL slots. */
        if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) {
            opal_output(0, "NULL found in dead process list.");
            continue;
        } else {
            OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                        "ORTED %s marking %s as dead",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(name_item)));
        }

        /* Stale notification: our local epoch record is already newer,
         * meaning this failure was handled previously. */
        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
            continue;
        }

        /* Increment the epoch */
        orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED);
        orte_util_set_epoch(name_item, name_item->epoch + 1);

        /* Remove the dead process from my list of children if applicable */
        /* NOTE(review): the removed child item is not OBJ_RELEASE'd here —
         * presumably ownership is retained elsewhere; verify no leak. */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t *) item;
            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
                        child->name, name_item)) {
                opal_list_remove_item(&orte_local_children, item);
                break;
            }
        }

        /* Remove the route from the routing layer */
        orte_routed.delete_route(name_item);
    }

    /* Update the routing module */
    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);

    /* Alert the registered fault callback (if any) after stabilization. */
    if (NULL != fault_cbfunc) {
        (*fault_cbfunc)(dead_procs);
    }

    return ORTE_SUCCESS;
}
/*
 * Handle a failure-notification message: unpack the names of the failed
 * processes, filter out stale reports (older epoch than our local record),
 * mark the remainder as dead via the errmgr, and forward the list to the
 * local application processes.
 *
 * Fixes over the previous revision:
 *  - the stale-epoch test read `proc.epoch` from an uninitialized local
 *    `proc` (undefined behavior); it now tests `name_item->epoch`, matching
 *    the identical check in mark_processes_as_dead().
 *  - `name_item` no longer leaks on the stale-epoch skip or unpack-error
 *    paths, and the `dead_names` pointer array object is released on every
 *    exit path.
 *
 * @param[in] sender  Daemon that sent the notification (logging only).
 * @param[in] buffer  Message: count (packed as ORTE_VPID), then ORTE_NAMEs.
 *
 * @retval ORTE_SUCCESS  The notification was processed.
 * @retval ORTE_ERROR    An unpack or forwarding step failed.
 */
int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
    opal_pointer_array_t *dead_names;
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS, num_failed;
    int32_t i;
    orte_process_name_t *name_item;

    dead_names = OBJ_NEW(opal_pointer_array_t);

    n = 1;
    /* Get the number of failed procs (packed on the wire as an ORTE_VPID;
     * unpacked into an int as elsewhere in this framework) */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(dead_names);
        return ret;
    }

    for (i = 0; i < num_failed; i++) {
        /* Unpack the buffer to get the dead process' name. */
        n = 1;
        name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
        if (NULL == name_item) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            free(name_item);
            goto cleanup;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(name_item),
                        ORTE_NAME_PRINT(sender));
        }

        /* There shouldn't be an issue of receiving this message multiple
         * times but it doesn't hurt to double check. Compare the reported
         * epoch against our local record; an older epoch means we already
         * handled this failure. */
        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
            free(name_item);
            continue;
        }

        opal_pointer_array_add(dead_names, name_item);
    }

    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
    orte_errmgr.mark_processes_as_dead(dead_names);

    /* Tell the applications' ORTE layers that there is a failure. */
    if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) {
        goto cleanup;
    }

cleanup:
    /* Free every name we unpacked; the array may be sparse if stale
     * notifications were skipped, so walk its actual size. */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
            free(name_item);
        }
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
/*****************
* Local Functions
*****************/
@ -601,14 +760,14 @@ static bool any_live_children(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
child->alive) {
@ -618,13 +777,13 @@ static bool any_live_children(orte_jobid_t job)
/* if we get here, then nobody is left alive from that job */
return false;
}
static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
{
int rc;
/* pack the child's vpid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
@ -659,70 +818,70 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
int rc;
opal_list_item_t *item, *next;
orte_odls_child_t *child;
orte_vpid_t null=ORTE_VPID_INVALID;
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we are timing things, pack the time the launch msg for this job was recvd */
if (orte_timing) {
int64_t tmp;
tmp = jobdat->launch_msg_recvd.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = jobdat->launch_msg_recvd.tv_usec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
/* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* flag that this job is complete so the receiver can know */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
/*
 * Pack a complete state-update message for one job into 'alert' for
 * delivery to the HNP.  Wire order is part of the receiver's contract:
 *   jobid, [optional launch-msg timing], per-child state records
 *   (via pack_state_for_proc), then an ORTE_VPID_INVALID end marker.
 *
 * @param[in,out] alert   Buffer the update is packed into.
 * @param[in]     jobdat  Local job data whose children are reported.
 *
 * @retval ORTE_SUCCESS  All fields packed.
 * @retval other         First failing opal_dss.pack return code.
 */
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
    int rc;
    opal_list_item_t *item, *next;
    orte_odls_child_t *child;
    /* sentinel terminating the per-child records on the wire */
    orte_vpid_t null=ORTE_VPID_INVALID;

    /* pack the jobid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* if we are timing things, pack the time the launch msg for this job was recvd */
    if (orte_timing) {
        int64_t tmp;
        tmp = jobdat->launch_msg_recvd.tv_sec;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        tmp = jobdat->launch_msg_recvd.tv_usec;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* 'next' is captured up front so the loop tolerates list mutation */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = next) {
        child = (orte_odls_child_t*)item;
        next = opal_list_get_next(item);
        /* if this child is part of the job... */
        if (child->name->jobid == jobdat->jobid) {
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
    }

    /* flag that this job is complete so the receiver can know */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
static bool all_children_registered(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* if this child has terminated, we consider it as having
@ -748,10 +907,10 @@ static bool all_children_registered(orte_jobid_t job)
}
}
}
/* if we get here, then everyone in the job is currently registered */
return true;
}
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
@ -759,14 +918,14 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
opal_list_item_t *item;
orte_odls_child_t *child;
int rc;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* pack the child's vpid - must be done in case rml_uri is NULL */
@ -774,6 +933,11 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
ORTE_ERROR_LOG(rc);
return rc;
}
/* Pack the child's epoch. */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the contact info */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
@ -781,19 +945,19 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
}
}
}
return ORTE_SUCCESS;
}
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* set the state */
jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -822,7 +986,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* update job state */
jobdat->state = jobstate;
/* update children */
@ -836,28 +1000,29 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
int rc;
/* stop local sensors for this job */
if (ORTE_VPID_WILDCARD == vpid) {
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
return;
}
OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -865,3 +1030,85 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
OBJ_DESTRUCT(&cmd);
OBJ_DESTRUCT(&proc);
}
/*
 * Record a locally-detected process failure: mark the process as dead in
 * the local data structures and notify the HNP so the failure can be
 * propagated.
 *
 * @param proc  name of the failed process (not owned; caller retains it)
 * @return ORTE_SUCCESS, or the error code from the first failed pack
 */
static int record_dead_process(orte_process_name_t *proc) {
    opal_pointer_array_t *dead_name;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command;
    int rc = ORTE_SUCCESS;
    /* the count is packed with the ORTE_VPID DSS type, so it must be an
     * orte_vpid_t - packing an int here would be a size mismatch whenever
     * orte_vpid_t is not the same width as int */
    orte_vpid_t num_failed;

    /* nothing to do if the process already terminated normally */
    if (orte_odls_base_default_check_finished(proc)) {
        return rc;
    }

    dead_name = OBJ_NEW(opal_pointer_array_t);

    opal_pointer_array_add(dead_name, proc);

    /* Mark the process as dead */
    mark_processes_as_dead(dead_name);

    /* Send a message to the HNP */
    buffer = OBJ_NEW(opal_buffer_t);

    command = ORTE_PROCESS_FAILED_NOTIFICATION;
    num_failed = 1;

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
    } else {
        /* only send the notification if the message was fully constructed;
         * a partially packed buffer would confuse the receiver */
        orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0);
    }

    OBJ_RELEASE(buffer);
    OBJ_RELEASE(dead_name);

    return rc;
}
/*
 * Deliver the list of dead process names to all local application
 * processes (via ORTE_RML_TAG_EPOCH_CHANGE) so they can update their
 * view of the failed procs.
 *
 * @param dead_names  pointer array of orte_process_name_t*; may contain
 *                    NULL holes, which are skipped
 * @return ORTE_SUCCESS or the error code from the failed pack/delivery
 */
int send_to_local_applications(opal_pointer_array_t *dead_names) {
    opal_buffer_t *buf;
    int ret;
    orte_process_name_t *name_item;
    int size, i;
    /* packed with the ORTE_VPID DSS type, so it must be an orte_vpid_t,
     * not an int */
    orte_vpid_t num_dead;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                "%s Sending failure to local applications.",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    buf = OBJ_NEW(opal_buffer_t);

    /* Count only the non-NULL entries: the receiver unpacks exactly the
     * count we announce, so packing the raw array size (which includes
     * holes) while skipping NULL items would leave the receiver expecting
     * names that were never packed. */
    size = opal_pointer_array_get_size(dead_names);
    num_dead = 0;
    for (i = 0; i < size; i++) {
        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
            num_dead++;
        }
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &num_dead, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(buf);
        return ret;
    }

    for (i = 0; i < size; i++) {
        if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buf);
                return ret;
            }
        }
    }

    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(buf);
        return ret;
    }

    OBJ_RELEASE(buf);

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -61,6 +61,7 @@ orte_ess_base_module_t orte_ess_alps_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch,
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -264,10 +265,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_ALL;
/* is this me? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
/* yes it is - reply with my rank. This is necessary
* because the pidmap will not have arrived when I
* am starting up, and if we use static ports, then
@ -348,6 +351,7 @@ static int alps_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = (orte_vpid_t) cnos_get_rank() + starting_vpid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:alps set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -363,5 +367,9 @@ static int alps_set_name(void)
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -86,6 +86,8 @@ ORTE_DECLSPEC int orte_ess_env_put(orte_std_cntr_t num_procs,
orte_std_cntr_t num_local_procs,
char ***env);
ORTE_DECLSPEC orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -46,6 +46,10 @@ int orte_ess_env_get(void)
return ORTE_ERR_NOT_FOUND;
}
orte_process_info.num_procs = (orte_std_cntr_t)num_procs;
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -24,10 +24,8 @@
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "orte/mca/ess/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
@ -38,10 +36,19 @@
opal_list_t orte_ess_base_components_available;
orte_ess_base_module_t orte_ess = {
NULL, /* init */
NULL, /* finalize */
NULL, /* abort */
NULL /* ft_event */
NULL, /* init */
NULL, /* finalize */
NULL, /* abort */
NULL, /* proc_get_locality */
NULL, /* proc_get_daemon */
NULL, /* proc_get_hostname */
NULL, /* get_local_rank */
NULL, /* get_node_rank */
NULL, /* proc_get_epoch */
NULL, /* update_pidmap */
NULL, /* update_nidmap */
NULL, /* query_sys_info */
NULL /* ft_event */
};
int orte_ess_base_output;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -24,10 +24,33 @@
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_component_repository.h"
#include "orte/util/nidmap.h"
#include "orte/mca/ess/base/base.h"
extern opal_list_t orte_ess_base_components_available;
/**
* Generic function to retrieve the epoch of a specific process
* from the job data.
*/
/*
 * Generic function to retrieve the epoch of a specific process from the
 * job data (via orte_util_lookup_epoch).
 *
 * @param proc  name of the process to look up
 * @return the process's epoch, or ORTE_EPOCH_INVALID if it cannot be found
 */
orte_epoch_t orte_ess_base_proc_get_epoch(orte_process_name_t *proc)
{
    orte_epoch_t epoch;

    if (ORTE_EPOCH_INVALID == (epoch = orte_util_lookup_epoch(proc))) {
        /* bug fix: return the epoch sentinel on failure - the original
         * returned ORTE_NODE_RANK_INVALID, a node-rank sentinel, from an
         * epoch-typed function (copy/paste from a node-rank accessor) */
        return ORTE_EPOCH_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                "%s ess:generic: proc %s has epoch %d",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                ORTE_NAME_PRINT(proc),
                epoch));

    return epoch;
}
int
orte_ess_base_select(void)
{

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -78,7 +78,7 @@ int orte_ess_base_app_setup(void)
error = "orte_errmgr_base_select";
goto error;
}
/* Setup the communication infrastructure */
/* Runtime Messaging Layer */
@ -92,6 +92,7 @@ int orte_ess_base_app_setup(void)
error = "orte_rml_base_select";
goto error;
}
/* Routed system */
if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
ORTE_ERROR_LOG(ret);
@ -238,6 +239,13 @@ int orte_ess_base_app_setup(void)
goto error;
}
/* Execute the post-startup errmgr code */
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
}
/* if we are an ORTE app - and not an MPI app - then
* we need to barrier here. MPI_Init has its own barrier,
* so we don't need to do two of them. However, if we
@ -270,6 +278,8 @@ error:
int orte_ess_base_app_finalize(void)
{
orte_errmgr.pre_shutdown();
orte_notifier_base_close();
orte_cr_finalize();

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -327,7 +327,7 @@ int orte_ess_base_orted_setup(char **hosts)
/* be sure to update the routing tree so the initial "phone home"
* to mpirun goes through the tree!
*/
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree())) {
if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ret);
error = "failed to update routing tree";
goto error;
@ -514,6 +514,13 @@ int orte_ess_base_orted_setup(char **hosts)
/* start the local sensors */
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* Execute the post-startup errmgr code */
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
}
return ORTE_SUCCESS;
error:
@ -526,6 +533,8 @@ int orte_ess_base_orted_setup(char **hosts)
int orte_ess_base_orted_finalize(void)
{
orte_errmgr.pre_shutdown();
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -58,6 +58,7 @@ orte_ess_base_module_t orte_ess_cnos_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* get_epoch */
NULL, /* add_pidmap is only used in ORTE */
NULL, /* update_nidmap is only used in ORTE */
query_sys_info,
@ -87,6 +88,10 @@ static int rte_init(void)
/* Get the number of procs in the job from cnos */
orte_process_info.num_procs = (orte_std_cntr_t) cnos_get_size();
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* Get the nid map */
nprocs = cnos_get_nidpid_map(&map);

12
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -99,6 +99,7 @@ orte_ess_base_module_t orte_ess_env_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -305,10 +306,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* is this me? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
/* yes it is - reply with my rank. This is necessary
* because the pidmap will not have arrived when I
* am starting up, and if we use static ports, then
@ -386,9 +389,10 @@ static int env_set_name(void)
return(rc);
}
free(tmp);
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:env set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2010 High Performance Computing Center Stuttgart,
@ -104,6 +104,15 @@ typedef orte_local_rank_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_
*/
typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc);
/**
* Update the epoch
*
* The epochs of the processes are stored in the process_name struct, but this
* will get the most up to date version stored within the orte_proc_t struct.
* Obviously the epoch of the proc that is passed in will be ignored.
*/
typedef orte_epoch_t (*orte_ess_base_module_proc_get_epoch_fn_t)(orte_process_name_t *proc);
/**
* Update the pidmap
*
@ -154,6 +163,7 @@ struct orte_ess_base_module_1_0_0_t {
orte_ess_base_module_proc_get_hostname_fn_t proc_get_hostname;
orte_ess_base_module_proc_get_local_rank_fn_t get_local_rank;
orte_ess_base_module_proc_get_node_rank_fn_t get_node_rank;
orte_ess_base_module_proc_get_epoch_fn_t proc_get_epoch;
orte_ess_base_module_update_pidmap_fn_t update_pidmap;
orte_ess_base_module_update_nidmap_fn_t update_nidmap;
orte_ess_base_module_query_sys_info_t query_sys_info;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -95,6 +95,7 @@ orte_ess_base_module_t orte_ess_generic_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch,
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -154,6 +155,7 @@ static int rte_init(void)
goto error;
}
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s completed name definition",
@ -165,6 +167,10 @@ static int rte_init(void)
goto error;
}
orte_process_info.num_procs = strtol(envar, NULL, 10);
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* set the app_num so that MPI attributes get set correctly */
orte_process_info.app_num = 1;
@ -267,6 +273,7 @@ static int rte_init(void)
if (vpid == ORTE_PROC_MY_NAME->vpid) {
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = i;
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
}
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s node %d name %s rank %s",
@ -297,6 +304,7 @@ static int rte_init(void)
if (vpid == ORTE_PROC_MY_NAME->vpid) {
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = i;
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
}
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s node %d name %s rank %d",

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -75,6 +75,7 @@
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/comm/comm.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wait.h"
@ -108,6 +109,7 @@ orte_ess_base_module_t orte_ess_hnp_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -490,6 +492,8 @@ static int rte_init(void)
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
proc->name.epoch = ORTE_EPOCH_MIN;
proc->pid = orte_process_info.pid;
proc->rml_uri = orte_rml.get_contact_info();
proc->state = ORTE_PROC_STATE_RUNNING;
@ -820,6 +824,7 @@ static uint8_t proc_get_locality(orte_process_name_t *proc)
orte_node_t *node;
orte_proc_t *myproc;
int i;
orte_ns_cmp_bitmask_t mask;
/* the HNP is always on node=0 of the node array */
node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
@ -829,8 +834,10 @@ static uint8_t proc_get_locality(orte_process_name_t *proc)
if (NULL == (myproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (myproc->name.jobid == proc->jobid &&
myproc->name.vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &myproc->name, proc)) {
OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
"%s ess:hnp: proc %s is LOCAL",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -877,7 +884,7 @@ static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
return ORTE_VPID_INVALID;
}
if( NULL == pdata->node->daemon ) {
if( NULL == pdata->node || NULL == pdata->node->daemon ) {
return ORTE_VPID_INVALID;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -67,6 +67,7 @@ orte_ess_base_module_t orte_ess_lsf_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -271,10 +272,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* is this me? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
/* yes it is - reply with my rank. This is necessary
* because the pidmap will not have arrived when I
* am starting up, and if we use static ports, then
@ -354,6 +357,7 @@ static int lsf_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
/* fix up the base name and make it the "real" name */
lsf_nodeid = atoi(getenv("LSF_PM_TASKID"));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -56,6 +56,7 @@ orte_ess_base_module_t orte_ess_portals4_shmem_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
NULL, /* add_pidmap is only used in ORTE */
NULL, /* update_nidmap is only used in ORTE */
query_sys_info,
@ -85,6 +86,10 @@ static int rte_init(void)
/* Get the number of procs in the job from portals4_shmem */
orte_process_info.num_procs = (orte_std_cntr_t) runtime_get_size();
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* Get the nid map */
nprocs = runtime_get_nidpid_map(&map);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -54,6 +54,7 @@ orte_ess_base_module_t orte_ess_portals_utcp_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
NULL, /* add_pidmap is only used in ORTE */
NULL, /* update_nidmap is only used in ORTE */
query_sys_info,
@ -91,6 +92,7 @@ static int rte_init(void)
return(rc);
}
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = ORTE_MIN_EPOCH;
/*
* Get the number of procs in the job. We assume vpids start at 0. We
@ -102,6 +104,10 @@ static int rte_init(void)
nidmap = opal_argv_split(nidmap_string, ':');
orte_process_info.num_procs = (orte_std_cntr_t) opal_argv_count(nidmap);
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* MPI_Init needs the grpcomm framework, so we have to init it */
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_open())) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -88,6 +88,7 @@ orte_ess_base_module_t orte_ess_singleton_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -187,6 +188,7 @@ static int rte_init(void)
/* set the name */
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0;
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
} else {
/*
@ -231,6 +233,10 @@ static int rte_init(void)
}
orte_process_info.num_procs = 1;
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* NOTE: do not wireup our io - let the fork'd orted serve
* as our io handler. This prevents issues with the event

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -98,6 +98,7 @@ orte_ess_base_module_t orte_ess_slave_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -183,9 +184,12 @@ static uint8_t proc_get_locality(orte_process_name_t *proc)
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* if it is me, the answer is my daemon's vpid */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
return ORTE_PROC_MY_DAEMON->vpid;
}
@ -195,9 +199,11 @@ static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
static char* proc_get_hostname(orte_process_name_t *proc)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* if it is me, the answer is my nodename */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
return orte_process_info.nodename;
}
@ -207,9 +213,11 @@ static char* proc_get_hostname(orte_process_name_t *proc)
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
/* if it is me, the local rank is zero */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
return 0;
}
@ -272,6 +280,7 @@ static int slave_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:slave set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -72,6 +72,7 @@ orte_ess_base_module_t orte_ess_slurm_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -275,10 +276,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_ALL;
/* is this me? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
/* yes it is - reply with my rank. This is necessary
* because the pidmap will not have arrived when I
* am starting up, and if we use static ports, then
@ -367,8 +370,8 @@ static int slurm_set_name(void)
/* fix up the vpid and make it the "real" vpid */
slurm_nodeid = atoi(getenv("SLURM_NODEID"));
ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -81,6 +81,7 @@ orte_ess_base_module_t orte_ess_slurmd_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -189,7 +190,7 @@ static int rte_init(void)
}
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
#endif
ORTE_PROC_MY_NAME->epoch = ORTE_EPOCH_MIN;
/* get our local rank */
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
error = "could not get SLURM_LOCALID";
@ -216,11 +217,15 @@ static int rte_init(void)
orte_process_info.num_procs = strtol(envar, NULL, 10);
#endif
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
#if WANT_SLURM_PMI_SUPPORT
if (PMI_SUCCESS != PMI_Get_appnum(&i)) {
error = "PMI_Get_appnum failed";
goto error;
}
orte_process_info.app_num = i;
#else
/* set the app_num so that MPI attributes get set correctly */
@ -250,6 +255,7 @@ static int rte_init(void)
nodeid = strtol(envar, NULL, 10);
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = nodeid;
ORTE_PROC_MY_DAEMON->epoch = ORTE_PROC_MY_NAME->epoch;
/* get the number of ppn */
if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {
@ -338,6 +344,7 @@ static int rte_init(void)
opal_pointer_array_add(&orte_jobmap, jmap);
/* update the num procs */
jmap->num_procs = orte_process_info.num_procs;
/* set the size of the pidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -70,6 +70,7 @@ orte_ess_base_module_t orte_ess_tm_module = {
proc_get_hostname,
proc_get_local_rank,
proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
update_pidmap,
update_nidmap,
orte_ess_base_query_sys_info,
@ -273,10 +274,12 @@ static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
orte_pmap_t *pmap;
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_ALL;
/* is this me? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid &&
proc->vpid == ORTE_PROC_MY_NAME->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
/* yes it is - reply with my rank. This is necessary
* because the pidmap will not have arrived when I
* am starting up, and if we use static ports, then
@ -361,6 +364,7 @@ static int tm_set_name(void)
ORTE_PROC_MY_NAME->jobid = jobid;
ORTE_PROC_MY_NAME->vpid = vpid;
ORTE_PROC_MY_NAME->epoch = orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"ess:tm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -57,6 +57,7 @@ orte_ess_base_module_t orte_ess_tool_module = {
NULL, /* don't need a proc_get_hostname fn */
NULL, /* don't need a proc_get_local_rank fn */
NULL, /* don't need a proc_get_node_rank fn */
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
NULL, /* don't need to update_pidmap */
NULL, /* don't need to update_nidmap */
query_sys_info,

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
@ -1097,8 +1097,11 @@ static int orte_filem_rsh_start_command(orte_filem_base_process_set_t *proc_set
if( NULL != proc_set ) {
wp_item->proc_set.source.jobid = proc_set->source.jobid;
wp_item->proc_set.source.vpid = proc_set->source.vpid;
wp_item->proc_set.source.epoch = proc_set->source.epoch;
wp_item->proc_set.sink.jobid = proc_set->sink.jobid;
wp_item->proc_set.sink.vpid = proc_set->sink.vpid;
wp_item->proc_set.sink.epoch = proc_set->sink.epoch;
}
/* Copy the File Set */
if( NULL != file_set ) {
@ -1346,6 +1349,7 @@ static void orte_filem_rsh_permission_callback(int status,
int num_req, num_allowed = 0;
int perm_flag, i;
int32_t peer_status = 0;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: permission_callback(? ?): Peer %s ...",
@ -1392,6 +1396,7 @@ static void orte_filem_rsh_permission_callback(int status,
wp_item = OBJ_NEW(orte_filem_rsh_work_pool_item_t);
wp_item->proc_set.source.jobid = sender->jobid;
wp_item->proc_set.source.vpid = sender->vpid;
wp_item->proc_set.source.epoch = sender->epoch;
opal_list_append(&work_pool_waiting, &(wp_item->super));
}
@ -1443,8 +1448,10 @@ static void orte_filem_rsh_permission_callback(int status,
item != opal_list_get_end( &work_pool_pending);
item = opal_list_get_next( item) ) {
wp_item = (orte_filem_rsh_work_pool_item_t *)item;
if(sender->jobid == wp_item->proc_set.source.jobid &&
sender->vpid == wp_item->proc_set.source.vpid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, sender, &wp_item->proc_set.source)) {
opal_list_remove_item( &work_pool_pending, item);
break;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -168,6 +168,9 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
/* I send first */
peer.vpid = vpids[1];
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* setup a temp buffer so I can inform the other side as to the
* number of entries in my buffer
*/
@ -223,6 +226,9 @@ static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_e
opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
opal_dss.copy_payload(&buf, sendbuf);
peer.vpid = vpids[0];
peer.epoch = orte_ess.proc_get_epoch(&peer);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:coll:two-proc sending to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -314,6 +320,9 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
/* first send my current contents */
nv = (rank - distance + np) % np;
peer.vpid = vpids[nv];
peer.epoch = orte_ess.proc_get_epoch(&peer);
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
opal_dss.copy_payload(&buf, &collection);
@ -331,6 +340,9 @@ static int bruck(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_ent
num_recvd = 0;
nv = (rank + distance) % np;
peer.vpid = vpids[nv];
peer.epoch = orte_ess.proc_get_epoch(&peer);
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
ORTE_RML_TAG_DAEMON_COLLECTIVE,
@ -427,6 +439,9 @@ static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int
/* first send my current contents */
nv = rank ^ distance;
peer.vpid = vpids[nv];
peer.epoch = orte_ess.proc_get_epoch(&peer);
OBJ_CONSTRUCT(&buf, opal_buffer_t);
opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
opal_dss.copy_payload(&buf, &collection);
@ -631,6 +646,8 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
proc.jobid = jobid;
proc.vpid = 0;
while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
proc.epoch = orte_ess.proc_get_epoch(&proc);
/* get the daemon that hosts this proc */
daemonvpid = orte_ess.proc_get_daemon(&proc);
/* is this daemon one of our children, or at least its contribution
@ -695,6 +712,8 @@ void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender,
/* send it */
my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
my_parent.vpid = orte_routed.get_routing_tree(NULL);
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
"%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -41,6 +41,8 @@
#include "orte/orted/orted.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "opal/mca/paffinity/paffinity.h"
#include "orte/mca/grpcomm/base/base.h"
#include "grpcomm_hier.h"
@ -93,6 +95,7 @@ static int init(void)
my_local_rank_zero_proc.jobid = ORTE_PROC_MY_NAME->jobid;
my_local_rank_zero_proc.vpid = ORTE_VPID_INVALID;
my_local_rank_zero_proc.epoch = ORTE_EPOCH_INVALID;
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) {
ORTE_ERROR_LOG(rc);
@ -267,6 +270,8 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
proc.jobid = ORTE_PROC_MY_NAME->jobid;
for (v=0; v < orte_process_info.num_procs; v++) {
proc.vpid = v;
proc.epoch = orte_util_lookup_epoch(&proc);
/* is this proc local_rank=0 on its node? */
if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
my_coll_peers[cpeers++] = v;
@ -280,12 +285,15 @@ static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
nm = OBJ_NEW(orte_namelist_t);
nm->name.jobid = proc.jobid;
nm->name.vpid = proc.vpid;
nm->name.epoch = proc.epoch;
opal_list_append(&my_local_peers, &nm->item);
/* if I am not local_rank=0, is this one? */
if (0 != my_local_rank &&
0 == orte_ess.get_local_rank(&proc)) {
my_local_rank_zero_proc.jobid = proc.jobid;
my_local_rank_zero_proc.vpid = proc.vpid;
my_local_rank_zero_proc.epoch = proc.epoch;
}
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -135,6 +135,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
ep = OBJ_NEW(orte_iof_sink_t); \
ep->name.jobid = (nm)->jobid; \
ep->name.vpid = (nm)->vpid; \
ep->name.epoch = (nm)->epoch; \
ep->tag = (tg); \
if (0 <= (fid)) { \
ep->wev->fd = (fid); \
@ -168,6 +169,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
rev = OBJ_NEW(orte_iof_read_event_t); \
rev->name.jobid = (nm)->jobid; \
rev->name.vpid = (nm)->vpid; \
rev->name.epoch = (nm)->epoch; \
rev->tag = (tg); \
rev->fd = (fid); \
*(rv) = rev; \
@ -192,6 +194,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
ep = OBJ_NEW(orte_iof_sink_t); \
ep->name.jobid = (nm)->jobid; \
ep->name.vpid = (nm)->vpid; \
ep->name.epoch = (nm)->epoch; \
ep->tag = (tg); \
if (0 <= (fid)) { \
ep->wev->fd = (fid); \
@ -212,6 +215,7 @@ typedef struct orte_iof_base_t orte_iof_base_t;
rev = OBJ_NEW(orte_iof_read_event_t); \
rev->name.jobid = (nm)->jobid; \
rev->name.vpid = (nm)->vpid; \
rev->name.epoch= (nm)->epoch; \
rev->tag = (tg); \
*(rv) = rev; \
opal_event_set(opal_event_base, \

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -91,6 +91,7 @@ static void orte_iof_base_sink_construct(orte_iof_sink_t* ptr)
{
ptr->daemon.jobid = ORTE_JOBID_INVALID;
ptr->daemon.vpid = ORTE_VPID_INVALID;
ptr->daemon.epoch = ORTE_EPOCH_INVALID;
ptr->wev = OBJ_NEW(orte_iof_write_event_t);
}
static void orte_iof_base_sink_destruct(orte_iof_sink_t* ptr)

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -42,6 +42,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/mca/odls/odls_types.h"
@ -147,6 +148,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
orte_odls_job_t *jobdat=NULL;
int np, numdigs;
int rc;
orte_ns_cmp_bitmask_t mask;
/* don't do this if the dst vpid is invalid or the fd is negative! */
if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
@ -174,8 +176,8 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
item != opal_list_get_end(&mca_iof_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == dst_name->jobid &&
proct->name.vpid == dst_name->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
/* found it */
goto SETUP;
}
@ -184,6 +186,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
proct->name.epoch = dst_name->epoch;
opal_list_append(&mca_iof_hnp_component.procs, &proct->super);
/* see if we are to output to a file */
if (NULL != orte_output_filename) {
@ -278,6 +281,7 @@ static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag,
&mca_iof_hnp_component.sinks);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = proc->node->daemon->name.vpid;
sink->daemon.epoch = orte_ess.proc_get_epoch(&sink->daemon);
}
}
@ -384,6 +388,7 @@ static int hnp_pull(const orte_process_name_t* dst_name,
&mca_iof_hnp_component.sinks);
sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;
sink->daemon.epoch = ORTE_PROC_MY_NAME->epoch;
return ORTE_SUCCESS;
}
@ -397,15 +402,17 @@ static int hnp_close(const orte_process_name_t* peer,
{
opal_list_item_t *item, *next_item;
orte_iof_sink_t* sink;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(&mca_iof_hnp_component.sinks);
item != opal_list_get_end(&mca_iof_hnp_component.sinks);
item = next_item ) {
sink = (orte_iof_sink_t*)item;
next_item = opal_list_get_next(item);
mask = ORTE_NS_CMP_ALL;
if((sink->name.jobid == peer->jobid) &&
(sink->name.vpid == peer->vpid) &&
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
(source_tag & sink->tag)) {
/* No need to delete the event or close the file

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -89,6 +89,7 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
opal_list_item_t *item, *prev_item;
orte_iof_proc_t *proct;
int rc;
orte_ns_cmp_bitmask_t mask;
OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);
@ -146,9 +147,10 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
continue;
}
mask = ORTE_NS_CMP_ALL;
/* if the daemon is me, then this is a local sink */
if (ORTE_PROC_MY_NAME->jobid == sink->daemon.jobid &&
ORTE_PROC_MY_NAME->vpid == sink->daemon.vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &sink->daemon)) {
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s read %d bytes from stdin - writing to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
@ -258,8 +260,8 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
item != opal_list_get_end(&mca_iof_hnp_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == rev->name.jobid &&
proct->name.vpid == rev->name.vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
/* found it - release corresponding event. This deletes
* the read event and closes the file descriptor
*/
@ -317,8 +319,9 @@ void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
continue;
}
/* is this the desired proc? */
if (sink->name.jobid == rev->name.jobid &&
sink->name.vpid == rev->name.vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
/* output to the corresponding file */
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
/* done */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -67,7 +67,7 @@ static void process_msg(int fd, short event, void *cbdata)
ORTE_ERROR_LOG(rc);
goto CLEAN_RETURN;
}
if (ORTE_IOF_XON & stream) {
/* re-start the stdin read event */
if (NULL != mca_iof_hnp_component.stdinev &&
@ -109,18 +109,21 @@ static void process_msg(int fd, short event, void *cbdata)
NULL, &mca_iof_hnp_component.sinks);
sink->daemon.jobid = mev->sender.jobid;
sink->daemon.vpid = mev->sender.vpid;
sink->daemon.epoch = mev->sender.epoch;
}
if (ORTE_IOF_STDERR & stream) {
ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR,
NULL, &mca_iof_hnp_component.sinks);
sink->daemon.jobid = mev->sender.jobid;
sink->daemon.vpid = mev->sender.vpid;
sink->daemon.epoch = mev->sender.epoch;
}
if (ORTE_IOF_STDDIAG & stream) {
ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG,
NULL, &mca_iof_hnp_component.sinks);
sink->daemon.jobid = mev->sender.jobid;
sink->daemon.vpid = mev->sender.vpid;
sink->daemon.epoch = mev->sender.epoch;
}
goto CLEAN_RETURN;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -128,6 +128,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
int fdout;
orte_odls_job_t *jobdat=NULL;
int np, numdigs;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
"%s iof:orted pushing fd %d for process %s",
@ -150,8 +151,10 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
item != opal_list_get_end(&mca_iof_orted_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == dst_name->jobid &&
proct->name.vpid == dst_name->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
/* found it */
goto SETUP;
}
@ -160,6 +163,7 @@ static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_ta
proct = OBJ_NEW(orte_iof_proc_t);
proct->name.jobid = dst_name->jobid;
proct->name.vpid = dst_name->vpid;
proct->name.epoch = dst_name->epoch;
opal_list_append(&mca_iof_orted_component.procs, &proct->super);
/* see if we are to output to a file */
if (NULL != orte_output_filename) {
@ -285,6 +289,7 @@ static int orted_close(const orte_process_name_t* peer,
{
opal_list_item_t *item, *next_item;
orte_iof_sink_t* sink;
orte_ns_cmp_bitmask_t mask;
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
@ -294,8 +299,9 @@ static int orted_close(const orte_process_name_t* peer,
sink = (orte_iof_sink_t*)item;
next_item = opal_list_get_next(item);
if((sink->name.jobid == peer->jobid) &&
(sink->name.vpid == peer->vpid) &&
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
(source_tag & sink->tag)) {
/* No need to delete the event or close the file

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -65,6 +65,7 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
int32_t numbytes;
opal_list_item_t *item;
orte_iof_proc_t *proct;
orte_ns_cmp_bitmask_t mask;
OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
@ -119,9 +120,11 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
if (ORTE_IOF_STDIN & sink->tag) {
continue;
}
mask = ORTE_NS_CMP_ALL;
/* is this the desired proc? */
if (sink->name.jobid == rev->name.jobid &&
sink->name.vpid == rev->name.vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
/* output to the corresponding file */
orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
/* done */
@ -178,8 +181,8 @@ CLEAN_RETURN:
item != opal_list_get_end(&mca_iof_orted_component.procs);
item = opal_list_get_next(item)) {
proct = (orte_iof_proc_t*)item;
if (proct->name.jobid == rev->name.jobid &&
proct->name.vpid == rev->name.vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
/* found it - release corresponding event. This deletes
* the read event and closes the file descriptor
*/

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -84,6 +84,9 @@ ORTE_DECLSPEC void orte_base_default_waitpid_fired(orte_process_name_t *proc, in
/* setup singleton job data */
ORTE_DECLSPEC void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid);
/* Lookup function to see if the child process has already finished. */
ORTE_DECLSPEC bool orte_odls_base_default_check_finished(orte_process_name_t *proc);
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -513,7 +513,7 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
ORTE_ERROR_LOG(rc);
}
/* update the routing tree */
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -556,7 +556,7 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
return rc;
}
/* update the routing tree */
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -620,7 +620,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
char **slot_str=NULL;
orte_jobid_t debugger;
bool add_child;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:constructing child list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -945,6 +946,8 @@ find_my_procs:
proc.jobid = jobdat->jobid;
for (j=0; j < jobdat->num_procs; j++) {
proc.vpid = j;
proc.epoch = ORTE_EPOCH_INVALID;
proc.epoch = orte_ess.proc_get_epoch(&proc);
/* get the vpid of the daemon that is to host this proc */
if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&proc))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
@ -976,8 +979,11 @@ find_my_procs:
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc.jobid &&
child->name->vpid == proc.vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, child->name, &proc)) {
/* do not duplicate this child on the list! */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"proc %s is on list and is %s",
@ -1243,6 +1249,20 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
free(param);
free(value);
/* setup the epoch */
if (ORTE_SUCCESS != (rc = orte_util_convert_epoch_to_string(&value, child->name->epoch))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (NULL == (param = mca_base_param_environ_variable("orte","ess","epoch"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
return rc;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
/* setup the vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) {
ORTE_ERROR_LOG(rc);
@ -2419,6 +2439,7 @@ void orte_odls_base_notify_iof_complete(orte_process_name_t *proc)
orte_odls_child_t *child;
opal_list_item_t *item;
int rc;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:notify_iof_complete for child %s",
@ -2437,9 +2458,10 @@ void orte_odls_base_notify_iof_complete(orte_process_name_t *proc)
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) { /* found it */
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { /* found it */
goto GOTCHILD;
}
}
@ -2497,6 +2519,7 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
opal_list_item_t *item;
opal_buffer_t buffer;
int rc;
orte_ns_cmp_bitmask_t mask;
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
@ -2510,9 +2533,11 @@ void orte_odls_base_default_report_abort(orte_process_name_t *proc)
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (proc->jobid == child->name->jobid &&
proc->vpid == child->name->vpid) { /* found it */
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */
child->state = ORTE_PROC_STATE_CALLED_ABORT;
/* send ack */
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
@ -2533,6 +2558,7 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
orte_odls_job_t *jobdat, *jdat;
opal_list_item_t *item;
int rc;
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired on child %s with status %d",
@ -2552,8 +2578,10 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (proc->jobid == child->name->jobid &&
proc->vpid == child->name->vpid) { /* found it */
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */
goto GOTCHILD;
}
}
@ -2893,6 +2921,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
OBJ_CONSTRUCT(&proctmp, orte_proc_t);
proctmp.name.jobid = ORTE_JOBID_WILDCARD;
proctmp.name.vpid = ORTE_VPID_WILDCARD;
proctmp.name.epoch = ORTE_EPOCH_WILDCARD;
opal_pointer_array_add(&procarray, &proctmp);
procptr = &procarray;
do_cleanup = true;
@ -3258,3 +3287,26 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
return rc;
}
/*
 * Check whether a local child process has already reported itself
 * finished (i.e. whether its fini message has been received).
 *
 * @param proc  name of the process to look up among our local children
 * @return the child's fini_recvd flag if proc is one of our local
 *         children; false if no matching child is found
 */
bool orte_odls_base_default_check_finished(orte_process_name_t *proc) {
    orte_odls_child_t *child;
    opal_list_item_t *item;
    orte_ns_cmp_bitmask_t mask;
    bool finished = false;

    /* we are going to walk the global list of local children, so
     * protect it from concurrent modification */
    OPAL_THREAD_LOCK(&orte_odls_globals.mutex);

    /* find this child */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;

        mask = ORTE_NS_CMP_ALL;

        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, child->name)) { /* found it */
            finished = child->fini_recvd;
            break;
        }
    }

    /* BUGFIX: the original returned from inside the loop (and at the
     * end) while still holding the mutex, leaving it locked forever.
     * Always release the lock on the single exit path. */
    OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);

    return finished;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -41,6 +41,7 @@
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
#include "orte/util/parse_options.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/base/odls_private.h"
@ -185,6 +186,7 @@ int orte_odls_base_open(void)
if (-1 == rank) {
/* wildcard */
nm->name.vpid = ORTE_VPID_WILDCARD;
nm->name.epoch = ORTE_EPOCH_WILDCARD;
} else if (rank < 0) {
/* error out on bozo case */
orte_show_help("help-odls-base.txt",
@ -197,6 +199,7 @@ int orte_odls_base_open(void)
* will be in the job - we'll check later
*/
nm->name.vpid = rank;
nm->name.epoch = orte_ess.proc_get_epoch(&nm->name);
}
opal_list_append(&orte_odls_globals.xterm_ranks, &nm->item);
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -77,14 +77,17 @@ int orte_odls_base_preload_files_app_context(orte_app_context_t* app_context)
/* if I am the HNP, then use me as the source */
p_set->source.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->source.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->source.epoch = ORTE_PROC_MY_NAME->epoch;
}
else {
/* otherwise, set the HNP as the source */
p_set->source.jobid = ORTE_PROC_MY_HNP->jobid;
p_set->source.vpid = ORTE_PROC_MY_HNP->vpid;
p_set->source.epoch = ORTE_PROC_MY_HNP->epoch;
}
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
opal_list_append(&(filem_request->process_sets), &(p_set->super) );

Просмотреть файл

@ -1,7 +1,7 @@
/* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -88,6 +88,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
/* process called "errmgr.abort_procs" */
#define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28
/* commands used for fault recovery */
#define ORTE_PROCESS_FAILED_NOTIFICATION (orte_daemon_cmd_flag_t) 30
/*
* List object to locally store the process names and pids of
* our children. This can subsequently be used to order termination

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -164,8 +164,7 @@ int mca_oob_tcp_component_open(void)
#ifdef __WINDOWS__
WSADATA win_sock_data;
if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) {
opal_output (0, "mca_oob_tcp_component_open: failed to initialise windows sockets: error %d\n",
WSAGetLastError());
opal_output (0, "mca_oob_tcp_component_init: failed to initialise windows sockets: error %d\n", WSAGetLastError());
return ORTE_ERROR;
}
#endif
@ -432,7 +431,7 @@ int mca_oob_tcp_component_close(void)
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.tcp_available_devices))) {
OBJ_RELEASE(item);
}
#if 0
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_connections_lock);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_connections_return);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_pending_connections);
@ -452,6 +451,7 @@ int mca_oob_tcp_component_close(void)
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_list);
opal_output_close(mca_oob_tcp_output_handle);
#endif
return ORTE_SUCCESS;
}
@ -1975,7 +1975,7 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri)
peer->peer_state = MCA_OOB_TCP_CLOSED;
/* clear any pending sends */
while (NULL != (item = opal_list_remove_first(&peer->peer_send_queue))) {
MCA_OOB_TCP_MSG_RETURN( ((mca_oob_tcp_msg_t *)item) );
OBJ_RELEASE(item);
}
peer->peer_send_msg = NULL;
/* clear any pending recvs */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -605,14 +605,26 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
peer->peer_state);
}
OPAL_OUTPUT_VERBOSE((1, mca_oob_tcp_output_handle,
"%s-%s mca_oob_tcp_peer_close(%p) sd %d state %d\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->peer_name)),
(void *) peer,
peer->peer_sd,
peer->peer_state));
mca_oob_tcp_peer_shutdown(peer);
/* inform the ERRMGR framework that we have lost a connection so
* it can decide if this is important, what to do about it, etc.
*/
if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(peer->peer_name.jobid, ORTE_JOB_STATE_COMM_FAILED,
&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED,
0, ORTE_ERROR_DEFAULT_EXIT_CODE)) {
if (ORTE_ERR_UNRECOVERABLE == orte_errmgr.update_state(
peer->peer_name.jobid,
ORTE_JOB_STATE_COMM_FAILED,
&peer->peer_name,
ORTE_PROC_STATE_COMM_FAILED,
0,
ORTE_ERROR_DEFAULT_EXIT_CODE)) {
/* Should free the peer lock before we abort so we don't
* get stuck in the orte_wait_kill when receiving messages in the
* tcp OOB
@ -891,11 +903,6 @@ int mca_oob_tcp_peer_send_ident(mca_oob_tcp_peer_t* peer)
static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
{
mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t *)user;
/* if we are abnormally terminating, ignore this */
if (orte_abnormal_term_ordered) {
return;
}
OPAL_THREAD_LOCK(&peer->peer_lock);
switch(peer->peer_state) {
case MCA_OOB_TCP_CONNECT_ACK:

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -62,10 +62,12 @@ int orte_plm_base_set_hnp_name(void)
/* set the name */
ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
ORTE_PROC_MY_NAME->vpid = 0;
ORTE_PROC_MY_NAME->epoch= ORTE_EPOCH_MIN;
/* copy it to the HNP field */
ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
ORTE_PROC_MY_HNP->epoch = ORTE_PROC_MY_NAME->epoch;
/* done */
return ORTE_SUCCESS;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -40,6 +40,7 @@
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/ras/ras.h"
#include "orte/mca/rmaps/rmaps.h"
@ -219,7 +220,12 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
* asked to communicate.
*/
orte_process_info.num_procs = jdatorted->num_procs;
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -371,6 +377,7 @@ int orte_plm_base_launch_apps(orte_jobid_t job)
/* push stdin - the IOF will know what to do with the specified target */
name.jobid = job;
name.vpid = jdata->stdin_target;
name.epoch = orte_ess.proc_get_epoch(&name);
if (ORTE_SUCCESS != (rc = orte_iof.push(&name, ORTE_IOF_STDIN, 0))) {
ORTE_ERROR_LOG(rc);
@ -606,7 +613,6 @@ CLEANUP:
} else {
orted_num_callback++;
}
}
static void orted_report_launch(int status, orte_process_name_t* sender,

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -34,6 +34,7 @@
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/runtime/orte_globals.h"
@ -162,6 +163,8 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
continue;
}
peer.vpid = v;
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* don't worry about errors on the send here - just
* issue it and keep going
*/
@ -238,6 +241,7 @@ int orte_plm_base_orted_terminate_job(orte_jobid_t jobid)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = jobid;
proc.name.vpid = ORTE_VPID_WILDCARD;
proc.name.epoch = ORTE_EPOCH_WILDCARD;
opal_pointer_array_add(&procs, &proc);
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_kill_local_procs(&procs))) {
ORTE_ERROR_LOG(rc);
@ -335,6 +339,7 @@ int orte_plm_base_orted_kill_local_procs(opal_pointer_array_t *procs)
continue;
}
peer.vpid = v;
peer.epoch = orte_ess.proc_get_epoch(&peer);
/* check to see if this daemon is known to be "dead" */
if (proc->state > ORTE_PROC_STATE_UNTERMINATED) {
/* don't try to send this */

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -44,6 +44,7 @@
#include "orte/util/error_strings.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
@ -145,6 +146,7 @@ static void process_msg(int fd, short event, void *data)
orte_job_t *jdata, *parent;
opal_buffer_t answer;
orte_vpid_t vpid;
orte_epoch_t epoch;
orte_proc_t *proc;
orte_proc_state_t state;
orte_exit_code_t exit_code;
@ -392,6 +394,9 @@ static void process_msg(int fd, short event, void *data)
break;
}
name.vpid = vpid;
name.epoch = ORTE_EPOCH_INVALID;
name.epoch = orte_ess.proc_get_epoch(&name);
/* unpack the pid */
count = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &pid, &count, OPAL_PID))) {
@ -467,6 +472,9 @@ static void process_msg(int fd, short event, void *data)
break;
case ORTE_PLM_INIT_ROUTES_CMD:
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive init routes command",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
count=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(msgpkt->buffer, &job, &count, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
@ -479,6 +487,15 @@ static void process_msg(int fd, short event, void *data)
break;
}
name.vpid = vpid;
count=1;
opal_dss.unpack(msgpkt->buffer, &epoch, &count, ORTE_EPOCH);
name.epoch = epoch;
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive Described rank %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&name)));
/* update the errmgr state */
orte_errmgr.update_state(job, ORTE_JOB_STATE_REGISTERED,
&name, ORTE_PROC_STATE_REGISTERED,
@ -491,9 +508,17 @@ static void process_msg(int fd, short event, void *data)
if (ORTE_SUCCESS != (rc = orte_routed.init_routes(job, msgpkt->buffer))) {
ORTE_ERROR_LOG(rc);
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive done with init routes command",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
break;
default:
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive unknown command",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
break;
@ -516,7 +541,10 @@ static void process_msg(int fd, short event, void *data)
if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
orte_jobs_complete();
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:receive done processing commands",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
/*

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -54,6 +54,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/show_help.h"
@ -1525,6 +1526,8 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
{
char *param, *path, *tmp, *cmd, *basename, *dest_dir;
int i;
orte_epoch_t epoch;
orte_process_name_t proc;
/* if a prefix is set, pass it to the bootproxy in a special way */
if (NULL != app->prefix_dir) {
@ -1633,6 +1636,17 @@ int orte_plm_base_append_bootproxy_args(orte_app_context_t *app, char ***argv,
free(param);
opal_setenv("OMPI_COMM_WORLD_RANK", cmd, true, argv);
free(cmd);
/* set the epoch */
proc.jobid = jobid;
proc.vpid = vpid;
proc.epoch = ORTE_EPOCH_MIN;
epoch = orte_ess.proc_get_epoch(&proc);
orte_util_convert_epoch_to_string(&cmd, epoch);
param = mca_base_param_environ_variable("orte","ess","epoch");
opal_setenv(param, cmd, true, argv);
free(param);
free(cmd);
/* set the number of procs */
asprintf(&cmd, "%d", (int)num_procs);
@ -1727,6 +1741,7 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
orte_node_t *node_from_map, *node;
orte_odls_job_t *jobdat = NULL;
opal_list_item_t *item = NULL;
orte_ns_cmp_bitmask_t mask;
/* set the state to restart */
jdata->state = ORTE_JOB_STATE_RESTART;
@ -1751,8 +1766,10 @@ void orte_plm_base_reset_job(orte_job_t *jdata)
if (NULL == (proc_from_node = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (proc_from_node->name.jobid == proc->name.jobid &&
proc_from_node->name.vpid == proc->name.vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proc_from_node->name, &proc->name)) {
/* got it! */
OBJ_RELEASE(proc); /* keep accounting straight */
opal_pointer_array_set_item(node->procs, i, NULL);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,6 +30,7 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/ess/ess.h"
#include "opal/mca/sysinfo/sysinfo_types.h"
#include "orte/util/show_help.h"
@ -451,6 +452,10 @@ int orte_rmaps_base_claim_slot(orte_job_t *jdata,
/* we do not set the vpid here - this will be done
* during a second phase
*/
/* We do set the epoch here since they all start with the same value. */
proc->name.epoch = ORTE_EPOCH_MIN;
proc->app_idx = app_idx;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
"%s rmaps:base:claim_slot: created new proc %s",
@ -554,6 +559,11 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
}
}
proc->name.vpid = vpid;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
/* If there is an invalid epoch here, it's because it doesn't exist yet. */
if (ORTE_NODE_RANK_INVALID == proc->name.epoch) {
proc->name.epoch = ORTE_EPOCH_MIN;
}
}
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
@ -590,6 +600,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
}
}
proc->name.vpid = vpid;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
}
if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
@ -822,6 +833,7 @@ int orte_rmaps_base_define_daemons(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.vpid = daemons->num_procs; /* take the next available vpid */
proc->name.epoch = ORTE_EPOCH_MIN;
proc->node = node;
proc->nodename = node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,
@ -1000,6 +1012,7 @@ int orte_rmaps_base_setup_virtual_machine(orte_job_t *jdata)
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc->name.vpid = jdata->num_procs; /* take the next available vpid */
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
proc->node = node;
proc->nodename = node->name;
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output,

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -40,6 +40,7 @@
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/show_help.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
@ -500,6 +501,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
}
}
proc->name.vpid = rank;
/* Either init or update the epoch. */
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
proc->slot_list = strdup(rfmap->slot_list);
/* insert the proc into the proper place */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -35,6 +35,7 @@
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/hostfile/hostfile.h"
#include "orte/util/dash_host/dash_host.h"
#include "orte/util/name_fns.h"
@ -234,6 +235,8 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
}
/* assign the vpid */
proc->name.vpid = vpid++;
proc->name.epoch = orte_ess.proc_get_epoch(&proc->name);
/* add to the jdata proc array */
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -341,6 +341,7 @@ static void recv_construct(rmcast_base_recv_t *ptr)
{
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch= ORTE_EPOCH_INVALID;
ptr->channel = ORTE_RMCAST_INVALID_CHANNEL;
OBJ_CONSTRUCT(&ptr->ctl, orte_thread_ctl_t);
ptr->seq_num = ORTE_RMCAST_SEQ_INVALID;
@ -428,6 +429,7 @@ static void recvlog_construct(rmcast_recv_log_t *ptr)
{
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch = ORTE_EPOCH_INVALID;
OBJ_CONSTRUCT(&ptr->last_msg, opal_list_t);
}
static void recvlog_destruct(rmcast_recv_log_t *ptr)
@ -436,6 +438,7 @@ static void recvlog_destruct(rmcast_recv_log_t *ptr)
ptr->name.jobid = ORTE_JOBID_INVALID;
ptr->name.vpid = ORTE_VPID_INVALID;
ptr->name.epoch = ORTE_EPOCH_INVALID;
while (NULL != (item = opal_list_remove_first(&ptr->last_msg))) {
OBJ_RELEASE(item);
}

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -678,6 +681,7 @@ static int tcp_recv(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
}
*seq_num = recvptr->seq_num;
*msg = recvptr->iovec_array;
@ -772,6 +776,7 @@ static int tcp_recv_buffer(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
}
*seq_num = recvptr->seq_num;
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -457,6 +460,7 @@ static int udp_recv(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
}
*seq_num = recvptr->seq_num;
*msg = recvptr->iovec_array;
@ -549,6 +553,7 @@ static int udp_recv_buffer(orte_process_name_t *name,
/* caller requested id of sender */
name->jobid = recvptr->name.jobid;
name->vpid = recvptr->name.vpid;
name->epoch= recvptr->name.epoch;
}
*seq_num = recvptr->seq_num;
if (ORTE_SUCCESS != (ret = opal_dss.copy_payload(buf, recvptr->buf))) {

Просмотреть файл

@ -1,4 +1,7 @@
/*
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -64,12 +67,14 @@ static void msg_pkt_constructor(orte_msg_packet_t *pkt)
{
pkt->sender.jobid = ORTE_JOBID_INVALID;
pkt->sender.vpid = ORTE_VPID_INVALID;
pkt->sender.epoch = ORTE_EPOCH_INVALID;
pkt->buffer = NULL;
}
static void msg_pkt_destructor(orte_msg_packet_t *pkt)
{
pkt->sender.jobid = ORTE_JOBID_INVALID;
pkt->sender.vpid = ORTE_VPID_INVALID;
pkt->sender.epoch = ORTE_EPOCH_INVALID;
if (NULL != pkt->buffer) {
OBJ_RELEASE(pkt->buffer);
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -138,10 +138,15 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
ORTE_PROC_IS_DAEMON &&
orte_process_info.num_procs < num_procs) {
orte_process_info.num_procs = num_procs;
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* if we changed it, then we better update the routed
* tree so daemon collectives work correctly
*/
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(rc);
}
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -360,6 +360,7 @@ rml_oob_queued_progress(int fd, short event, void *arg)
origin = hdr->origin;
next = orte_routed.get_route(&hdr->destination);
#if 0
if (next.vpid == ORTE_VPID_INVALID) {
opal_output(0,
"%s:queued progress tried routing message from %s to %s:%d, can't find route",
@ -370,6 +371,7 @@ rml_oob_queued_progress(int fd, short event, void *arg)
opal_backtrace_print(stderr);
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
}
#endif
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
opal_output(0, "%s:queued progress trying to get message from %s to %s:%d, routing loop",
@ -467,6 +469,7 @@ rml_oob_recv_route_callback(int status,
next = orte_routed.get_route(&hdr->destination);
if (next.vpid == ORTE_VPID_INVALID) {
#if 0
opal_output(0, "%s:route_callback tried routing message from %s to %s:%d, can't find route",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&origin),
@ -474,6 +477,8 @@ rml_oob_recv_route_callback(int status,
hdr->tag);
opal_backtrace_print(stderr);
orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
#endif
return;
}
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {

Просмотреть файл

@ -2,6 +2,9 @@
* Copyright (c) 2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -77,7 +80,8 @@ orte_rml_oob_purge(orte_process_name_t *peer)
orte_rml_oob_queued_msg_t *qmsg;
orte_rml_oob_msg_header_t *hdr;
orte_process_name_t step;
orte_ns_cmp_bitmask_t mask;
/* clear the oob contact info and pending messages */
orte_rml_oob_module.active_oob->oob_set_addr(peer, NULL);
@ -89,12 +93,14 @@ orte_rml_oob_purge(orte_process_name_t *peer)
qmsg = (orte_rml_oob_queued_msg_t*)item;
hdr = (orte_rml_oob_msg_header_t*) qmsg->payload[0].iov_base;
step = orte_routed.get_route(&hdr->destination);
if (peer->jobid == hdr->destination.jobid &&
peer->vpid == hdr->destination.vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, peer, &hdr->destination)) {
opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item);
OBJ_RELEASE(item);
} else if (step.jobid == hdr->destination.jobid &&
step.vpid == hdr->destination.vpid) {
} else if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &step, &hdr->destination)) {
opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item);
OBJ_RELEASE(item);
}

Просмотреть файл

@ -1,4 +1,7 @@
/*
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -17,7 +20,6 @@
#include "rml_oob.h"
static void
orte_rml_recv_msg_callback(int status,
struct orte_process_name_t* peer,

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -62,6 +62,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
pkt = OBJ_NEW(orte_msg_packet_t); \
pkt->sender.jobid = (sndr)->jobid; \
pkt->sender.vpid = (sndr)->vpid; \
pkt->sender.epoch = (sndr)->epoch; \
if ((crt)) { \
pkt->buffer = OBJ_NEW(opal_buffer_t); \
opal_dss.copy_payload(pkt->buffer, *(buf)); \
@ -84,6 +85,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
pkt = OBJ_NEW(orte_msg_packet_t); \
pkt->sender.jobid = (sndr)->jobid; \
pkt->sender.vpid = (sndr)->vpid; \
pkt->sender.epoch = (sndr)->epoch; \
if ((crt)) { \
pkt->buffer = OBJ_NEW(opal_buffer_t); \
opal_dss.copy_payload(pkt->buffer, *(buf)); \
@ -189,6 +191,9 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_msg_packet_t);
#define ORTE_RML_TAG_SUBSCRIBE 46
/* For Epoch Updates */
#define ORTE_RML_TAG_EPOCH_CHANGE 47
#define ORTE_RML_TAG_MAX 100

Просмотреть файл

@ -4,6 +4,9 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -62,6 +65,7 @@ static void jfamconst(orte_routed_jobfam_t *ptr)
{
ptr->route.jobid = ORTE_JOBID_INVALID;
ptr->route.vpid = ORTE_VPID_INVALID;
ptr->route.epoch = ORTE_EPOCH_INVALID;
ptr->hnp_uri = NULL;
}
static void jfamdest(orte_routed_jobfam_t *ptr)
@ -113,6 +117,7 @@ orte_routed_base_open(void)
jfam = OBJ_NEW(orte_routed_jobfam_t);
jfam->route.jobid = ORTE_PROC_MY_HNP->jobid;
jfam->route.vpid = ORTE_PROC_MY_HNP->vpid;
jfam->route.epoch = ORTE_PROC_MY_HNP->epoch;
jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
if (NULL != orte_process_info.my_hnp_uri) {
jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri);
@ -247,6 +252,7 @@ void orte_routed_base_update_hnps(opal_buffer_t *buf)
jfam->job_family = jobfamily;
jfam->route.jobid = name.jobid;
jfam->route.vpid = name.vpid;
jfam->route.epoch = name.epoch;
jfam->hnp_uri = strdup(uri);
done:
free(uri);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -127,6 +127,7 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
orte_std_cntr_t cnt;
char *rml_uri;
orte_vpid_t vpid;
orte_epoch_t epoch;
int rc;
if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
@ -144,13 +145,18 @@ int orte_routed_base_process_callback(orte_jobid_t job, opal_buffer_t *buffer)
/* unpack the data for each entry */
cnt = 1;
while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &cnt, ORTE_VPID))) {
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &epoch, &cnt, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
continue;
}
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
continue;
}
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routed_binomial:callback got uri %s for job %s rank %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,6 +29,7 @@
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/runtime.h"
@ -44,7 +48,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(void);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
@ -143,7 +147,8 @@ static int delete_route(orte_process_name_t *proc)
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -211,7 +216,8 @@ static int update_route(orte_process_name_t *target,
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -269,6 +275,8 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
return ORTE_SUCCESS;
}
}
@ -282,6 +290,8 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
@ -304,11 +314,12 @@ static orte_process_name_t get_route(orte_process_name_t *target)
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
ret = ORTE_NAME_INVALID;
goto found;
}
/* if it is me, then the route is just direct */
if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
ret = target;
@ -376,48 +387,55 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/* THIS CAME FROM OUR OWN JOB FAMILY... */
/* if we are not using static ports and this is going to the HNP, send direct */
if (!orte_static_ports &&
ORTE_PROC_MY_HNP->jobid == target->jobid &&
ORTE_PROC_MY_HNP->vpid == target->vpid) {
if( !orte_static_ports &&
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing not enabled - going direct",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ret = target;
"%s routing to the HNP through my PLM parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
ret = ORTE_PROC_MY_PARENT;
goto found;
}
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
/* find out what daemon hosts this proc */
if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
/*ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);*/
ret = ORTE_NAME_INVALID;
goto found;
}
/* if the daemon is me, then send direct to the target! */
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
ret = target;
goto found;
} else {
/* search routing tree for next step to that daemon */
for (item = opal_list_get_first(&my_children);
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
if (child->vpid == daemon.vpid) {
/* the child is hosting the proc - just send it there */
ret = &daemon;
goto found;
}
/* otherwise, see if the daemon we need is below the child */
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
/* yep - we need to step through this child */
daemon.vpid = child->vpid;
ret = &daemon;
goto found;
}
startover:
/* search routing tree for next step to that daemon */
for (item = opal_list_get_first(&my_children);
item != opal_list_get_end(&my_children);
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
if (child->vpid == daemon.vpid) {
/* the child is hosting the proc - just send it there */
ret = &daemon;
goto found;
}
/* otherwise, see if the daemon we need is below the child */
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
/* yep - we need to step through this child */
daemon.vpid = child->vpid;
/* If the daemon to which we should be routing is dead, then update
* the routing tree and start over. */
if (!orte_util_proc_is_running(&daemon)) {
update_routing_tree(daemon.jobid);
goto startover;
}
ret = &daemon;
goto found;
}
}
@ -425,9 +443,12 @@ static orte_process_name_t get_route(orte_process_name_t *target)
* any of our children, so we have to step up through our parent
*/
daemon.vpid = my_parent.vpid;
ret = &daemon;
found:
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
"%s routed_binomial_get(%s) --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -843,17 +864,22 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = proc->epoch;
lifeline = &local_lifeline;
return ORTE_SUCCESS;
}
static int binomial_tree(int rank, int parent, int me, int num_procs,
int *nchildren, opal_list_t *childrn, opal_bitmap_t *relatives)
int *nchildren, opal_list_t *childrn,
opal_bitmap_t *relatives, bool mine, orte_jobid_t jobid)
{
int i, bitmap, peer, hibit, mask, found;
orte_routed_tree_t *child;
opal_bitmap_t *relations;
orte_process_name_t proc_name;
proc_name.jobid = jobid;
/* is this me? */
if (me == rank) {
@ -868,15 +894,43 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
child = OBJ_NEW(orte_routed_tree_t);
child->vpid = peer;
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
"%s routed:binomial found child %s",
"%s routed:binomial %d found child %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
rank,
ORTE_VPID_PRINT(child->vpid)));
if (NULL != childrn) {
/* If the process we are looking at next is already dead, then
* we inherit its children. Keep up with the process name of
* that process so we can check it's state.
*/
proc_name.vpid = peer;
proc_name.epoch = orte_util_lookup_epoch(&proc_name);
if (!orte_util_proc_is_running(&proc_name)
&& ORTE_EPOCH_MIN < proc_name.epoch
&& ORTE_EPOCH_INVALID != proc_name.epoch) {
OPAL_OUTPUT_VERBOSE((3, orte_routed_base_output,
"%s routed:binomial child %s is dead",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(child->vpid)));
relations = relatives;
/* Leave mine as it is. If it was true, then we want to
* inherit the dead node's children as our own. If it wasn't
* then we want it's relatives as our own. */
binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, mine, jobid);
/* If we use the proc_is_running as a way of measuring of the
* process is dead, then we get screwed up on startup. By also
* testing the epoch, we make sure that the process really did
* start up and then died. */
} else if (mine) {
/* this is a direct child - add it to my list */
opal_list_append(childrn, &child->super);
(*nchildren)++;
/* setup the relatives bitmap */
opal_bitmap_init(&child->relatives, num_procs);
/* point to the relatives */
relations = &child->relatives;
} else {
@ -886,7 +940,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
relations = relatives;
}
/* search for this child's relatives */
binomial_tree(0, 0, peer, num_procs, NULL, NULL, relations);
binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false, jobid);
}
}
return parent;
@ -902,7 +956,13 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
peer = rank | mask;
if (peer < num_procs) {
/* execute compute on this child */
if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives))) {
if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine, jobid))) {
proc_name.vpid = found;
if (!orte_util_proc_is_running(&proc_name) && ORTE_EPOCH_MIN < orte_util_lookup_epoch(&proc_name)) {
return parent;
}
return found;
}
}
@ -910,7 +970,7 @@ static int binomial_tree(int rank, int parent, int me, int num_procs,
return -1;
}
static int update_routing_tree(void)
static int update_routing_tree(orte_jobid_t jobid)
{
orte_routed_tree_t *child;
int j;
@ -933,8 +993,9 @@ static int update_routing_tree(void)
* lie underneath their branch
*/
my_parent.vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid,
orte_process_info.num_procs,
&num_children, &my_children, NULL);
orte_process_info.max_procs,
&num_children, &my_children, NULL, true, jobid);
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
if (0 < opal_output_get_verbosity(orte_routed_base_output)) {
opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), my_parent.vpid, num_children);
@ -943,7 +1004,7 @@ static int update_routing_tree(void)
item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item;
opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid);
for (j=0; j < (int)orte_process_info.num_procs; j++) {
for (j=0; j < (int)orte_process_info.max_procs; j++) {
if (opal_bitmap_is_set_bit(&child->relatives, j)) {
opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
}

Просмотреть файл

@ -4,6 +4,9 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -47,7 +50,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(void);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
@ -135,7 +138,8 @@ static int delete_route(orte_process_name_t *proc)
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -195,7 +199,8 @@ static int update_route(orte_process_name_t *target,
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -252,6 +257,8 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
return ORTE_SUCCESS;
}
}
@ -265,6 +272,8 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = orte_ess.proc_get_epoch(&jfam->route);
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
@ -287,7 +296,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
ret = ORTE_NAME_INVALID;
goto found;
}
@ -354,6 +364,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
goto found;
}
/* Initialize daemon's epoch, based on its current vpid/jobid */
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
/* if the daemon is me, then send direct to the target! */
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
ret = target;
@ -798,12 +811,14 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
lifeline = &local_lifeline;
return ORTE_SUCCESS;
}
static int update_routing_tree(void)
static int update_routing_tree(orte_jobid_t jobid)
{
/* nothing to do here */
return ORTE_SUCCESS;

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -36,7 +39,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(void);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
@ -131,7 +134,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
orte_process_name_t *ret;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
ret = ORTE_NAME_INVALID;
} else {
/* all routes are direct */
@ -305,7 +309,7 @@ static int set_lifeline(orte_process_name_t *proc)
return ORTE_SUCCESS;
}
static int update_routing_tree(void)
static int update_routing_tree(orte_jobid_t jobid)
{
/* nothing to do here */
return ORTE_SUCCESS;

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -43,7 +46,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(void);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
@ -126,7 +129,8 @@ static int delete_route(orte_process_name_t *proc)
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -194,7 +198,8 @@ static int update_route(orte_process_name_t *target,
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -252,6 +257,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
return ORTE_SUCCESS;
}
}
@ -265,6 +271,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
@ -338,14 +345,14 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */
/* if we are not using static ports and this is going to the HNP, send direct */
if (!orte_static_ports &&
ORTE_PROC_MY_HNP->jobid == target->jobid &&
ORTE_PROC_MY_HNP->vpid == target->vpid) {
/* if we are not using static ports and this is going to the HNP, send directly through my parent */
if( !orte_static_ports &&
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing not enabled - going direct",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ret = target;
"%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
ret = ORTE_PROC_MY_PARENT;
goto found;
}
@ -357,6 +364,9 @@ static orte_process_name_t get_route(orte_process_name_t *target)
goto found;
}
/* Initialize daemon's epoch, based on its current vpid/jobid */
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
/* if the daemon is me, then send direct to the target! */
if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
ret = target;
@ -376,6 +386,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* we are at end of chain - wrap around */
daemon.vpid = 0;
}
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ret = &daemon;
}
}
@ -715,12 +726,13 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = proc->epoch;
lifeline = &local_lifeline;
return ORTE_SUCCESS;
}
static int update_routing_tree(void)
static int update_routing_tree(orte_jobid_t jobid)
{
/* if I am anything other than a daemon or the HNP, this
* is a meaningless command as I am not allowed to route

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -44,7 +47,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(void);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
@ -142,7 +145,8 @@ static int delete_route(orte_process_name_t *proc)
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
proc->vpid == ORTE_VPID_INVALID ||
proc->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -210,7 +214,8 @@ static int update_route(orte_process_name_t *target,
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
return ORTE_ERR_BAD_PARAM;
}
@ -268,6 +273,7 @@ static int update_route(orte_process_name_t *target,
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
return ORTE_SUCCESS;
}
}
@ -281,6 +287,7 @@ static int update_route(orte_process_name_t *target,
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
jfam->route.epoch = route->epoch;
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
@ -303,7 +310,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
ret = ORTE_NAME_INVALID;
goto found;
}
@ -362,14 +370,14 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/* THIS CAME FROM OUR OWN JOB FAMILY... */
/* if we are not using static ports and this is going to the HNP, send direct */
if (!orte_static_ports &&
ORTE_PROC_MY_HNP->jobid == target->jobid &&
ORTE_PROC_MY_HNP->vpid == target->vpid) {
/* if we are not using static ports and this is going to the HNP, send directly through my parent */
if( !orte_static_ports &&
OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) ) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
"%s routing not enabled - going direct",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ret = target;
"%s routing to the HNP through my parent %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
ret = ORTE_PROC_MY_PARENT;
goto found;
}
@ -400,6 +408,7 @@ static orte_process_name_t get_route(orte_process_name_t *target)
if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
/* yep - we need to step through this child */
daemon.vpid = child->vpid;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ret = &daemon;
goto found;
}
@ -410,6 +419,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
* any of our children, so we have to step up through our parent
*/
daemon.vpid = my_parent.vpid;
daemon.epoch = orte_ess.proc_get_epoch(&daemon);
ret = &daemon;
found:
@ -765,6 +776,7 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = proc->epoch;
lifeline = &local_lifeline;
return ORTE_SUCCESS;
@ -815,7 +827,7 @@ static void radix_tree(int rank, int *num_children,
}
}
static int update_routing_tree(void)
static int update_routing_tree(orte_jobid_t jobid)
{
orte_routed_tree_t *child;
int j;
@ -857,6 +869,7 @@ static int update_routing_tree(void)
my_parent.vpid = (Ii-Sum) % NInPrevLevel;
my_parent.vpid += (Sum - NInPrevLevel);
}
my_parent.epoch = orte_ess.proc_get_epoch(&my_parent);
/* compute my direct children and the bitmap that shows which vpids
* lie underneath their branch

Просмотреть файл

@ -3,6 +3,9 @@
* All rights reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -191,10 +194,12 @@ typedef int (*orte_routed_module_get_wireup_info_fn_t)(opal_buffer_t *buf);
* of "leaves" for this process and identifies the vpid of the parent
* sitting above this process in the tree.
*
* @param [in] jobid The jobid of the routing tree that needs to be updated.
*
* @retval ORTE_SUCCESS The operation completed successfully
* @retval ORTE_ERROR_xxx The specifed error occurred
*/
typedef int (*orte_routed_module_update_routing_tree_fn_t)(void);
typedef int (*orte_routed_module_update_routing_tree_fn_t)(orte_jobid_t jobid);
/*
* Get the routing tree for this process

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -16,6 +19,7 @@
#include "opal/util/opal_sos.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/util/name_fns.h"
@ -37,7 +41,7 @@ static orte_process_name_t get_route(orte_process_name_t *target);
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static int update_routing_tree(void);
static int update_routing_tree(orte_jobid_t jobid);
static orte_vpid_t get_routing_tree(opal_list_t *children);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
@ -129,7 +133,8 @@ static orte_process_name_t get_route(orte_process_name_t *target)
orte_process_name_t *ret;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
target->vpid == ORTE_VPID_INVALID ||
target->epoch == ORTE_EPOCH_INVALID) {
ret = ORTE_NAME_INVALID;
} else {
/* a slave must always route via its parent daemon */
@ -251,9 +256,12 @@ static int route_lost(const orte_process_name_t *route)
static bool route_is_defined(const orte_process_name_t *target)
{
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_ALL;
/* only the route to my daemon is defined */
if (target->jobid != ORTE_PROC_MY_DAEMON->jobid ||
target->vpid != ORTE_PROC_MY_DAEMON->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, target, ORTE_PROC_MY_DAEMON)) {
return false;
}
@ -267,12 +275,14 @@ static int set_lifeline(orte_process_name_t *proc)
*/
local_lifeline.jobid = proc->jobid;
local_lifeline.vpid = proc->vpid;
local_lifeline.epoch = orte_ess.proc_get_epoch(&local_lifeline);
lifeline = &local_lifeline;
return ORTE_SUCCESS;
}
static int update_routing_tree(void)
static int update_routing_tree(orte_jobid_t jobid)
{
/* this is a meaningless command for a slave as I am not allowed to route */
return ORTE_ERR_NOT_SUPPORTED;

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -67,6 +70,7 @@ typedef struct {
opal_list_item_t super;
orte_jobid_t jobid;
orte_vpid_t vpid;
orte_epoch_t epoch;
char *file;
int tick;
bool check_size;

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
@ -81,6 +81,7 @@ void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
@ -91,6 +92,7 @@ void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
@ -468,12 +470,15 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
{
int ret, exit_status = ORTE_SUCCESS;
orte_std_cntr_t count = 1;
orte_ns_cmp_bitmask_t mask;
mask = ORTE_NS_CMP_ALL;
/*
* Do not send to self, as that is silly.
*/
if (peer->jobid == ORTE_PROC_MY_HNP->jobid &&
peer->vpid == ORTE_PROC_MY_HNP->vpid ) {
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
"%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
@ -650,6 +655,7 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
char *global_snapshot_handle = NULL;
char *tmp_str = NULL;
int seq_num;
orte_ns_cmp_bitmask_t mask;
/*
* Noop if invalid peer, or peer not specified (JJH Double check this)
@ -660,11 +666,12 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
return ORTE_SUCCESS;
}
mask = ORTE_NS_CMP_ALL;
/*
* Do not send to self, as that is silly.
*/
if (peer->jobid == ORTE_PROC_MY_HNP->jobid &&
peer->vpid == ORTE_PROC_MY_HNP->vpid ) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
"%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
@ -427,6 +427,7 @@ int global_coord_start_ckpt(orte_snapc_base_quiesce_t *datum)
new_proc = OBJ_NEW(orte_proc_t);
new_proc->name.jobid = proc->name.jobid;
new_proc->name.vpid = proc->name.vpid;
new_proc->name.epoch = proc->name.epoch;
new_proc->node = OBJ_NEW(orte_node_t);
new_proc->node->name = proc->node->name;
opal_list_append(migrating_procs, &new_proc->super);
@ -590,6 +591,7 @@ static int global_init_job_structs(void)
orte_proc_t **procs = NULL;
orte_std_cntr_t i = 0;
orte_vpid_t p = 0;
orte_ns_cmp_bitmask_t mask;
/* look up job data object */
if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) {
@ -616,9 +618,12 @@ static int global_init_job_structs(void)
orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid;
orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid;
orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch;
if( orted_snapshot->process_name.jobid == ORTE_PROC_MY_NAME->jobid &&
orted_snapshot->process_name.vpid == ORTE_PROC_MY_NAME->vpid ) {
mask = ORTE_NS_CMP_JOBID;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, &orted_snapshot->process_name, ORTE_PROC_MY_NAME)) {
global_coord_has_local_children = true;
}
@ -631,6 +636,7 @@ static int global_init_job_structs(void)
app_snapshot->process_name.jobid = procs[p]->name.jobid;
app_snapshot->process_name.vpid = procs[p]->name.vpid;
app_snapshot->process_name.epoch = procs[p]->name.epoch;
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
}
@ -657,6 +663,7 @@ static int global_refresh_job_structs(void)
orte_std_cntr_t i = 0;
orte_vpid_t p = 0;
bool found = false;
orte_ns_cmp_bitmask_t mask;
/* look up job data object */
if (NULL == (jdata = orte_get_job_data_object(current_global_jobid))) {
@ -793,6 +800,7 @@ static int global_refresh_job_structs(void)
app_snapshot->process_name.jobid = procs[p]->name.jobid;
app_snapshot->process_name.vpid = procs[p]->name.vpid;
app_snapshot->process_name.epoch = procs[p]->name.epoch;
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
}
@ -808,9 +816,12 @@ static int global_refresh_job_structs(void)
orted_snapshot->process_name.jobid = cur_node->daemon->name.jobid;
orted_snapshot->process_name.vpid = cur_node->daemon->name.vpid;
orted_snapshot->process_name.epoch = cur_node->daemon->name.epoch;
if( orted_snapshot->process_name.jobid == ORTE_PROC_MY_NAME->jobid &&
orted_snapshot->process_name.vpid == ORTE_PROC_MY_NAME->vpid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, &orted_snapshot->process_name, ORTE_PROC_MY_NAME)) {
global_coord_has_local_children = true;
}
for(p = 0; p < cur_node->num_procs; ++p) {
@ -826,6 +837,7 @@ static int global_refresh_job_structs(void)
app_snapshot->process_name.jobid = procs[p]->name.jobid;
app_snapshot->process_name.vpid = procs[p]->name.vpid;
app_snapshot->process_name.epoch = procs[p]->name.epoch;
opal_list_append(&(orted_snapshot->super.local_snapshots), &(app_snapshot->super));
}
@ -2375,14 +2387,17 @@ static orte_snapc_full_orted_snapshot_t *find_orted_snapshot(orte_process_name_t
orte_snapc_full_orted_snapshot_t *orted_snapshot = NULL;
opal_list_item_t* item = NULL;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(&(global_snapshot.local_snapshots));
item != opal_list_get_end(&(global_snapshot.local_snapshots));
item = opal_list_get_next(item) ) {
orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item;
if( name->jobid == orted_snapshot->process_name.jobid &&
name->vpid == orted_snapshot->process_name.vpid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, name, &orted_snapshot->process_name)) {
return orted_snapshot;
}
}
@ -2404,8 +2419,10 @@ static orte_snapc_full_orted_snapshot_t *find_orted_snapshot(orte_process_name_t
item = opal_list_get_next(item) ) {
orted_snapshot = (orte_snapc_full_orted_snapshot_t*)item;
if( name->jobid == orted_snapshot->process_name.jobid &&
name->vpid == orted_snapshot->process_name.vpid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, name, &orted_snapshot->process_name)) {
return orted_snapshot;
}
}

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
@ -2033,6 +2033,7 @@ static int snapc_full_local_get_vpids(void)
vpid_snapshot->process_pid = child->pid;
vpid_snapshot->super.process_name.jobid = child->name->jobid;
vpid_snapshot->super.process_name.vpid = child->name->vpid;
vpid_snapshot->super.process_name.epoch = child->name->epoch;
}
}
@ -2094,6 +2095,7 @@ static int snapc_full_local_refresh_vpids(void)
vpid_snapshot->process_pid = child->pid;
vpid_snapshot->super.process_name.jobid = child->name->jobid;
vpid_snapshot->super.process_name.vpid = child->name->vpid;
vpid_snapshot->super.process_name.epoch = child->name->epoch;
/*vpid_snapshot->migrating = true;*/
opal_list_append(&(local_global_snapshot.local_snapshots), &(vpid_snapshot->super.super));
@ -2109,6 +2111,7 @@ static int snapc_full_local_refresh_vpids(void)
vpid_snapshot->process_pid = child->pid;
vpid_snapshot->super.process_name.jobid = child->name->jobid;
vpid_snapshot->super.process_name.vpid = child->name->vpid;
vpid_snapshot->super.process_name.epoch = child->name->epoch;
}
}
@ -2119,14 +2122,17 @@ static orte_snapc_full_app_snapshot_t *find_vpid_snapshot(orte_process_name_t *n
{
opal_list_item_t* item = NULL;
orte_snapc_full_app_snapshot_t *vpid_snapshot = NULL;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(&(local_global_snapshot.local_snapshots));
item != opal_list_get_end(&(local_global_snapshot.local_snapshots));
item = opal_list_get_next(item) ) {
vpid_snapshot = (orte_snapc_full_app_snapshot_t*)item;
if( name->jobid == vpid_snapshot->super.process_name.jobid &&
name->vpid == vpid_snapshot->super.process_name.vpid ) {
mask = ORTE_NS_CMP_JOBID;
if (OPAL_EQUAL ==
orte_util_compare_name_fields(mask, name, &vpid_snapshot->super.process_name)) {
return vpid_snapshot;
}
}

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
@ -83,6 +83,7 @@ OBJ_CLASS_INSTANCE(orte_snapc_full_app_snapshot_t,
void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot) {
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
}
@ -90,6 +91,7 @@ void orte_snapc_full_orted_construct(orte_snapc_full_orted_snapshot_t *snapshot)
void orte_snapc_full_orted_destruct( orte_snapc_full_orted_snapshot_t *snapshot) {
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = 0;
snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
}

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -59,6 +62,7 @@ void orte_sstore_base_local_snapshot_info_construct(orte_sstore_base_local_snaps
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
snapshot->crs_comp = NULL;
snapshot->compress_comp = NULL;
@ -72,6 +76,7 @@ void orte_sstore_base_local_snapshot_info_destruct( orte_sstore_base_local_snaps
{
snapshot->process_name.jobid = 0;
snapshot->process_name.vpid = 0;
snapshot->process_name.epoch = ORTE_EPOCH_MIN;
if( NULL != snapshot->crs_comp ) {
free(snapshot->crs_comp);
@ -632,6 +637,7 @@ int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_in
vpid_snapshot->process_name.jobid = proc.jobid;
vpid_snapshot->process_name.vpid = proc.vpid;
vpid_snapshot->process_name.epoch = proc.epoch;
}
else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) {
vpid_snapshot->crs_comp = strdup(value);

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -1212,6 +1215,7 @@ static int orte_sstore_central_extract_global_metadata(orte_sstore_central_globa
vpid_snapshot->process_name.jobid = handle_info->jobid;
vpid_snapshot->process_name.vpid = i;
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
vpid_snapshot->crs_comp = NULL;
global_snapshot->start_time = NULL;

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -207,6 +210,7 @@ void orte_sstore_central_local_app_snapshot_info_construct(orte_sstore_central_l
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_INVALID;
info->local_location = NULL;
info->metadata_filename = NULL;
@ -218,6 +222,7 @@ void orte_sstore_central_local_app_snapshot_info_destruct( orte_sstore_central_l
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_INVALID;
if( NULL != info->local_location ) {
free(info->local_location);
@ -530,6 +535,7 @@ static int append_new_app_handle_info(orte_sstore_central_local_snapshot_info_t
app_info->name.jobid = name->jobid;
app_info->name.vpid = name->vpid;
app_info->name.epoch = name->epoch;
opal_list_append(handle_info->app_info_handle, &(app_info->super));
@ -541,14 +547,16 @@ static orte_sstore_central_local_app_snapshot_info_t *find_app_handle_info(orte_
{
orte_sstore_central_local_app_snapshot_info_t *app_info = NULL;
opal_list_item_t* item = NULL;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(handle_info->app_info_handle);
item != opal_list_get_end(handle_info->app_info_handle);
item = opal_list_get_next(item) ) {
app_info = (orte_sstore_central_local_app_snapshot_info_t*)item;
if( app_info->name.jobid == name->jobid &&
app_info->name.vpid == name->vpid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &app_info->name, name)) {
return app_info;
}
}

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -1214,8 +1217,10 @@ static int process_local_push(orte_process_name_t* peer, opal_buffer_t* buffer,
p_set = OBJ_NEW(orte_filem_base_process_set_t);
p_set->source.jobid = peer->jobid;
p_set->source.vpid = peer->vpid;
p_set->source.epoch = peer->epoch;
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
}
@ -1700,6 +1705,7 @@ static int orte_sstore_stage_extract_global_metadata(orte_sstore_stage_global_sn
vpid_snapshot->process_name.jobid = handle_info->jobid;
vpid_snapshot->process_name.vpid = i;
vpid_snapshot->process_name.epoch = orte_ess.proc_get_epoch(&vpid_snapshot->process_name);
/* JJH: Currently we do not have this information since we do not save
* individual vpid info in the Global SStore. It is in the metadata

Просмотреть файл

@ -1,6 +1,9 @@
/*
* Copyright (c) 2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -284,6 +287,7 @@ void orte_sstore_stage_local_app_snapshot_info_construct(orte_sstore_stage_local
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_INVALID;
info->local_location = NULL;
info->compressed_local_location = NULL;
@ -298,6 +302,7 @@ void orte_sstore_stage_local_app_snapshot_info_destruct( orte_sstore_stage_local
{
info->name.jobid = ORTE_JOBID_INVALID;
info->name.vpid = ORTE_VPID_INVALID;
info->name.epoch = ORTE_EPOCH_INVALID;
if( NULL != info->local_location ) {
free(info->local_location);
@ -1009,6 +1014,7 @@ static int append_new_app_handle_info(orte_sstore_stage_local_snapshot_info_t *h
app_info->name.jobid = name->jobid;
app_info->name.vpid = name->vpid;
app_info->name.epoch = name->epoch;
opal_list_append(handle_info->app_info_handle, &(app_info->super));
@ -1020,14 +1026,16 @@ static orte_sstore_stage_local_app_snapshot_info_t *find_app_handle_info(orte_ss
{
orte_sstore_stage_local_app_snapshot_info_t *app_info = NULL;
opal_list_item_t* item = NULL;
orte_ns_cmp_bitmask_t mask;
for(item = opal_list_get_first(handle_info->app_info_handle);
item != opal_list_get_end(handle_info->app_info_handle);
item = opal_list_get_next(item) ) {
app_info = (orte_sstore_stage_local_app_snapshot_info_t*)item;
if( app_info->name.jobid == name->jobid &&
app_info->name.vpid == name->vpid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &app_info->name, name)) {
return app_info;
}
}
@ -2049,14 +2057,17 @@ static int orte_sstore_stage_local_preload_files(char **local_location, bool *sk
/* if I am the HNP, then use me as the source */
p_set->source.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->source.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->source.epoch = ORTE_PROC_MY_NAME->epoch;
}
else {
/* otherwise, set the HNP as the source */
p_set->source.jobid = ORTE_PROC_MY_HNP->jobid;
p_set->source.vpid = ORTE_PROC_MY_HNP->vpid;
p_set->source.epoch = ORTE_PROC_MY_HNP->epoch;
}
p_set->sink.jobid = ORTE_PROC_MY_NAME->jobid;
p_set->sink.vpid = ORTE_PROC_MY_NAME->vpid;
p_set->sink.epoch = ORTE_PROC_MY_NAME->epoch;
opal_list_append(&(filem_request->process_sets), &(p_set->super) );
/* Define the file set */

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -27,6 +27,7 @@
#endif
#include "opal/dss/dss_types.h"
#include "opal/class/opal_pointer_array.h"
#include "orte/mca/rml/rml_types.h"
BEGIN_C_DECLS
@ -50,4 +51,7 @@ ORTE_DECLSPEC extern struct timeval orte_daemon_msg_recvd;
END_C_DECLS
/* Local function */
int send_to_local_applications(opal_pointer_array_t *dead_names);
#endif /* ORTED_H */

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше