Temporarily back out r15517 and r15520 so that I can get the RML / OOB changes
to apply cleanly. This commit was SVN r15527. The following SVN revision numbers were found above: r15517 --> open-mpi/ompi@41977fcc95
Этот коммит содержится в:
родитель
824ef791f9
Коммит
2d17dd9516
@ -1198,6 +1198,7 @@ AC_CONFIG_FILES([
|
||||
orte/include/Makefile
|
||||
orte/etc/Makefile
|
||||
|
||||
orte/tools/console/Makefile
|
||||
orte/tools/orteboot/Makefile
|
||||
orte/tools/orted/Makefile
|
||||
orte/tools/ortehalt/Makefile
|
||||
|
@ -1068,7 +1068,7 @@ int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high )
|
||||
ourproc = intercomm->c_local_group->grp_proc_pointers[0];
|
||||
theirproc = intercomm->c_remote_group->grp_proc_pointers[0];
|
||||
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
rc = orte_ns.compare_fields(mask, &(ourproc->proc_name), &(theirproc->proc_name));
|
||||
if ( 0 > rc ) {
|
||||
flag = true;
|
||||
|
@ -26,7 +26,6 @@
|
||||
#include "btl_base_error.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
int mca_btl_base_debug;
|
||||
|
||||
@ -60,7 +59,8 @@ void mca_btl_base_error_no_nics(const char* transport,
|
||||
char *procid;
|
||||
if (mca_btl_base_warn_component_unused) {
|
||||
/* print out no-nic warning if user told us to */
|
||||
asprintf(&procid, "%s", ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
asprintf(&procid, "[%lu,%lu,%lu]",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
|
||||
opal_show_help("help-mpi-btl-base.txt", "btl:no-nics",
|
||||
true, procid, transport, orte_system_info.nodename,
|
||||
|
@ -27,7 +27,6 @@
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
OMPI_DECLSPEC extern int mca_btl_base_debug;
|
||||
|
||||
@ -36,9 +35,9 @@ extern int mca_btl_base_out(const char*, ...);
|
||||
|
||||
#define BTL_OUTPUT(args) \
|
||||
do { \
|
||||
mca_btl_base_out("[%s]%s[%s:%d:%s] ", \
|
||||
mca_btl_base_out("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
|
||||
orte_system_info.nodename, \
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), \
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), \
|
||||
__FILE__, __LINE__, __func__); \
|
||||
mca_btl_base_out args; \
|
||||
mca_btl_base_out("\n"); \
|
||||
@ -47,9 +46,9 @@ do { \
|
||||
|
||||
#define BTL_ERROR(args) \
|
||||
do { \
|
||||
mca_btl_base_err("[%s]%s[%s:%d:%s] ", \
|
||||
mca_btl_base_err("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
|
||||
orte_system_info.nodename, \
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), \
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), \
|
||||
__FILE__, __LINE__, __func__); \
|
||||
mca_btl_base_err args; \
|
||||
mca_btl_base_err("\n"); \
|
||||
@ -57,8 +56,8 @@ do { \
|
||||
|
||||
#define BTL_PEER_ERROR(proc, args) \
|
||||
do { \
|
||||
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), \
|
||||
mca_btl_base_err("[%ld,%ld,%ld][%s:%d:%s] from %s ", \
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
orte_system_info.nodename); \
|
||||
if(proc && proc->proc_hostname) { \
|
||||
@ -73,9 +72,9 @@ do { \
|
||||
#define BTL_DEBUG(args) \
|
||||
do { \
|
||||
if(mca_btl_base_debug) { \
|
||||
mca_btl_base_err("[%s]%s[%s:%d:%s] ", \
|
||||
mca_btl_base_err("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
|
||||
orte_system_info.nodename, \
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), \
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), \
|
||||
__FILE__, __LINE__, __func__); \
|
||||
mca_btl_base_err args; \
|
||||
mca_btl_base_err("\n"); \
|
||||
@ -84,9 +83,9 @@ do { \
|
||||
#define BTL_VERBOSE(args) \
|
||||
do { \
|
||||
if(mca_btl_base_debug > 1) { \
|
||||
mca_btl_base_err("[%s]%s[%s:%d:%s] ", \
|
||||
mca_btl_base_err("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
|
||||
orte_system_info.nodename, \
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), \
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), \
|
||||
__FILE__, __LINE__, __func__); \
|
||||
mca_btl_base_err args; \
|
||||
mca_btl_base_err("\n"); \
|
||||
|
@ -429,10 +429,10 @@ static int mca_btl_gm_discover( void )
|
||||
|
||||
if(mca_btl_gm_component.gm_debug > 0) {
|
||||
opal_output(0,
|
||||
"%s gm_port %08lX, "
|
||||
"[%ld,%ld,%ld] gm_port %08lX, "
|
||||
"board %" PRIu32 ", global %" PRIu32 " "
|
||||
"node %" PRIu32 "port %" PRIu32 "\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
(unsigned long) port, board_no, global_id, node_id, port_no);
|
||||
}
|
||||
|
||||
|
@ -128,15 +128,15 @@ mca_btl_gm_proc_t* mca_btl_gm_proc_create(ompi_proc_t* ompi_proc)
|
||||
(void*)&gm_proc->proc_addrs,
|
||||
&size);
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(gm_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_gm_addr_t)) != 0) {
|
||||
opal_output(0, "[%s:%d] invalid gm address for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] invalid gm address for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(gm_proc);
|
||||
return NULL;
|
||||
}
|
||||
@ -189,9 +189,9 @@ int mca_btl_gm_proc_insert(
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if(mca_btl_gm_component.gm_debug > 0) {
|
||||
opal_output(0, "%s mapped global id %" PRIu32
|
||||
opal_output(0, "[%ld,%ld,%ld] mapped global id %" PRIu32
|
||||
" to node id %" PRIu32 "\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
gm_endpoint->endpoint_addr.global_id,
|
||||
gm_endpoint->endpoint_addr.node_id);
|
||||
}
|
||||
|
@ -798,8 +798,8 @@ void mca_btl_mvapi_dump(
|
||||
opal_output( 0, "No endpoint for this peer\n" );
|
||||
return;
|
||||
}
|
||||
opal_output( 0, "endpoint with processor %s\n",
|
||||
ORTE_NAME_PRINT( &(endpoint->endpoint_proc->proc_ompi->proc_name) ) );
|
||||
opal_output( 0, "endpoint with processor (%lu.%lu.%lu)\n",
|
||||
ORTE_NAME_ARGS( &(endpoint->endpoint_proc->proc_ompi->proc_name) ) );
|
||||
opal_output( 0, "endpoint state: %s\n",
|
||||
(endpoint->endpoint_state == MCA_BTL_IB_CONNECTING ? "connecting" :
|
||||
(endpoint->endpoint_state == MCA_BTL_IB_CONNECT_ACK ? "waiting ack" :
|
||||
|
@ -140,15 +140,15 @@ mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc)
|
||||
|
||||
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(mvapi_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_mvapi_port_info_t)) != 0) {
|
||||
opal_output(0, "[%s:%d] invalid mvapi address for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] invalid mvapi address for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(mvapi_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -121,8 +121,8 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
|
||||
rc = ompi_modex_recv( &mca_btl_mx_component.super.btl_version,
|
||||
ompi_proc, (void*)&mx_peers, &size );
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
opal_output( 0, "mca_pml_base_modex_recv failed for peer %s",
|
||||
ORTE_NAME_PRINT(&ompi_proc->proc_name) );
|
||||
opal_output( 0, "mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(&ompi_proc->proc_name) );
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -130,8 +130,8 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
|
||||
return NULL;
|
||||
}
|
||||
if( (size % sizeof(mca_btl_mx_addr_t)) != 0 ) {
|
||||
opal_output( 0, "invalid mx address for peer %s",
|
||||
ORTE_NAME_PRINT(&ompi_proc->proc_name) );
|
||||
opal_output( 0, "invalid mx address for peer [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(&ompi_proc->proc_name) );
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -127,15 +127,15 @@ mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
|
||||
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(0,
|
||||
"[%s:%d] ompi_modex_recv failed for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
"[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_ud_addr_t)) != 0) {
|
||||
opal_output(0, "[%s:%d] invalid module address for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -137,15 +137,15 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
|
||||
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(mca_btl_base_output, "[%s:%d] ompi_modex_recv failed for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(mca_btl_base_output, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_openib_port_info_t)) != 0) {
|
||||
opal_output(mca_btl_base_output, "[%s:%d] invalid module address for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(mca_btl_base_output, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -470,8 +470,8 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
|
||||
ORTE_PROCESS_NAME_NTOH(guid);
|
||||
/* compare this to the expected values */
|
||||
if (0 != orte_ns.compare_fields(ORTE_NS_CMP_ALL, &btl_proc->proc_name, &guid)) {
|
||||
BTL_ERROR(("received unexpected process identifier %s",
|
||||
ORTE_NAME_PRINT(&guid)));
|
||||
BTL_ERROR(("received unexpected process identifier [%lu,%lu,%lu]",
|
||||
ORTE_NAME_ARGS(&guid)));
|
||||
mca_btl_tcp_endpoint_close(btl_endpoint);
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
|
@ -129,15 +129,15 @@ mca_btl_udapl_proc_t* mca_btl_udapl_proc_create(ompi_proc_t* ompi_proc)
|
||||
(void*)&udapl_proc->proc_addrs,
|
||||
&size);
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(udapl_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_udapl_addr_t)) != 0) {
|
||||
opal_output(0, "[%s:%d] invalid udapl address for peer %s",
|
||||
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
|
||||
opal_output(0, "[%s:%d] invalid udapl address for peer [%lu,%lu,%lu]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(udapl_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -435,6 +435,7 @@ void ompi_crcp_coord_pml_message_ref_construct(ompi_crcp_coord_pml_message_ref_t
|
||||
msg_ref->comm = NULL;
|
||||
msg_ref->request = NULL;
|
||||
|
||||
msg_ref->proc_name.cellid = ORTE_CELLID_INVALID;
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
|
||||
@ -470,6 +471,7 @@ void ompi_crcp_coord_pml_message_ref_destruct( ompi_crcp_coord_pml_message_ref_t
|
||||
msg_ref->request = NULL;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = ORTE_CELLID_INVALID;
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
|
||||
@ -487,6 +489,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_peer_ref_t,
|
||||
ompi_crcp_coord_pml_peer_ref_destruct);
|
||||
|
||||
void ompi_crcp_coord_pml_peer_ref_construct(ompi_crcp_coord_pml_peer_ref_t *peer_ref) {
|
||||
peer_ref->proc_name.cellid = ORTE_CELLID_INVALID;
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
|
||||
@ -518,6 +521,7 @@ void ompi_crcp_coord_pml_peer_ref_construct(ompi_crcp_coord_pml_peer_ref_t *peer
|
||||
void ompi_crcp_coord_pml_peer_ref_destruct( ompi_crcp_coord_pml_peer_ref_t *peer_ref) {
|
||||
opal_list_item_t* item = NULL;
|
||||
|
||||
peer_ref->proc_name.cellid = ORTE_CELLID_INVALID;
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
|
||||
@ -589,6 +593,7 @@ OBJ_CLASS_INSTANCE(drain_msg_ack_ref_t,
|
||||
/*
 * Put a drain_msg_ack_ref_t into its initial state: the ACK is flagged as
 * not complete, and the cellid, jobid and vpid fields of the peer name are
 * each set to their respective ORTE_*_INVALID constant.
 */
void drain_msg_ack_ref_construct(drain_msg_ack_ref_t *msg_ack_ref) {
|
||||
msg_ack_ref->complete = false;
|
||||
|
||||
msg_ack_ref->peer.cellid = ORTE_CELLID_INVALID;
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
}
|
||||
@ -596,6 +601,7 @@ void drain_msg_ack_ref_construct(drain_msg_ack_ref_t *msg_ack_ref) {
|
||||
/*
 * Tear-down counterpart of drain_msg_ack_ref_construct: clears the complete
 * flag and overwrites the cellid, jobid and vpid fields of the peer name
 * with their ORTE_*_INVALID constants. Nothing is freed here.
 */
void drain_msg_ack_ref_destruct( drain_msg_ack_ref_t *msg_ack_ref) {
|
||||
msg_ack_ref->complete = false;
|
||||
|
||||
msg_ack_ref->peer.cellid = ORTE_CELLID_INVALID;
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
}
|
||||
@ -644,7 +650,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_state_t,
|
||||
v_msg_ref = v_coord_state->msg_ref; \
|
||||
}
|
||||
|
||||
#define CREATE_NEW_MSG(msg_ref, v_type, v_buffer, v_count, v_datatype, v_tag, v_rank, v_comm, v_request, p_jobid, p_vpid) \
|
||||
#define CREATE_NEW_MSG(msg_ref, v_type, v_buffer, v_count, v_datatype, v_tag, v_rank, v_comm, v_request, p_cellid, p_jobid, p_vpid) \
|
||||
{ \
|
||||
msg_ref = OBJ_NEW(ompi_crcp_coord_pml_message_ref_t); \
|
||||
msg_ref->msg_id = message_seq_num; \
|
||||
@ -671,6 +677,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_state_t,
|
||||
OBJ_RETAIN(msg_ref->request); \
|
||||
} \
|
||||
\
|
||||
msg_ref->proc_name.cellid = p_cellid; \
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
}
|
||||
@ -701,6 +708,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_state_t,
|
||||
OBJ_RETAIN(msg_ref->request); \
|
||||
} \
|
||||
\
|
||||
dup_msg_ref->proc_name.cellid = msg_ref->proc_name.cellid; \
|
||||
dup_msg_ref->proc_name.jobid = msg_ref->proc_name.jobid; \
|
||||
dup_msg_ref->proc_name.vpid = msg_ref->proc_name.vpid; \
|
||||
}
|
||||
@ -876,6 +884,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_add_procs(
|
||||
for( i = 0; i < nprocs; ++i) {
|
||||
new_peer_ref = OBJ_NEW(ompi_crcp_coord_pml_peer_ref_t);
|
||||
|
||||
new_peer_ref->proc_name.cellid = procs[i]->proc_name.cellid;
|
||||
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
|
||||
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
|
||||
|
||||
@ -908,8 +917,8 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_del_procs(
|
||||
item = (opal_list_item_t*)find_peer(procs[i]->proc_name);
|
||||
if(NULL == item) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: del_procs: Unable to find peer %s\n",
|
||||
ORTE_NAME_PRINT(&(procs[i]->proc_name)));
|
||||
"crcp:coord: del_procs: Unable to find peer [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(&(procs[i]->proc_name)));
|
||||
exit_status = OMPI_ERROR;
|
||||
goto DONE;
|
||||
}
|
||||
@ -972,6 +981,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_isend_init(
|
||||
buf,
|
||||
count, datatype, tag, dst, comm,
|
||||
NULL,
|
||||
peer_ref->proc_name.cellid,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
|
||||
@ -1065,6 +1075,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_isend(
|
||||
CREATE_NEW_MSG(msg_ref, COORD_MSG_TYPE_I_SEND,
|
||||
buf,
|
||||
count, datatype, tag, dst, comm, NULL,
|
||||
peer_ref->proc_name.cellid,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
|
||||
@ -1160,6 +1171,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_send(
|
||||
CREATE_NEW_MSG(msg_ref, COORD_MSG_TYPE_B_SEND,
|
||||
buf,
|
||||
count, datatype, tag, dst, comm, NULL,
|
||||
peer_ref->proc_name.cellid,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
|
||||
@ -1252,6 +1264,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv_init(
|
||||
buf,
|
||||
count, datatype, tag, src, comm,
|
||||
NULL, /* Leave this NULL for now, will pick up real value in POST */
|
||||
ORTE_CELLID_INVALID,
|
||||
ORTE_JOBID_INVALID,
|
||||
ORTE_VPID_INVALID);
|
||||
|
||||
@ -1283,6 +1296,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv_init(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
|
||||
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
|
||||
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
|
||||
|
||||
@ -1438,6 +1452,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv(
|
||||
buf,
|
||||
count, datatype, tag, src, comm,
|
||||
NULL, /* Leave this NULL for now, will pick up real value in POST */
|
||||
ORTE_CELLID_INVALID,
|
||||
ORTE_JOBID_INVALID,
|
||||
ORTE_VPID_INVALID);
|
||||
|
||||
@ -1469,6 +1484,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
|
||||
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
|
||||
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
|
||||
|
||||
@ -1640,6 +1656,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
|
||||
buf,
|
||||
count, datatype, tag, src,
|
||||
comm, request,
|
||||
ORTE_CELLID_INVALID,
|
||||
ORTE_JOBID_INVALID,
|
||||
ORTE_VPID_INVALID);
|
||||
|
||||
@ -1671,6 +1688,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
|
||||
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
|
||||
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
|
||||
|
||||
@ -1704,6 +1722,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
|
||||
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
|
||||
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
|
||||
|
||||
@ -1840,6 +1859,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
|
||||
CREATE_NEW_MSG(msg_ref, COORD_MSG_TYPE_B_RECV,
|
||||
buf,
|
||||
count, datatype, tag, src, comm, NULL,
|
||||
ORTE_CELLID_INVALID,
|
||||
ORTE_JOBID_INVALID,
|
||||
ORTE_VPID_INVALID);
|
||||
|
||||
@ -1875,6 +1895,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
|
||||
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
|
||||
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
|
||||
|
||||
@ -1921,6 +1942,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
|
||||
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
|
||||
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
|
||||
|
||||
@ -2998,8 +3020,8 @@ static int ft_event_coordinate_peers(void)
|
||||
*/
|
||||
if( stall_for_completion ) {
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: ft_event_coordinate_peers: %s **** STALLING ***",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
"crcp:coord: ft_event_coordinate_peers: [%lu,%lu,%lu] **** STALLING ***",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
step_to_return_to = 1;
|
||||
exit_status = OMPI_SUCCESS;
|
||||
goto DONE;
|
||||
@ -3036,8 +3058,8 @@ static int ft_event_coordinate_peers(void)
|
||||
}
|
||||
|
||||
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: ft_event_coordinate_peers: %s Coordination Finished...\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name) );
|
||||
"crcp:coord: ft_event_coordinate_peers: [%lu,%lu,%lu] Coordination Finished...\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name) );
|
||||
|
||||
/*
|
||||
* Now that all our peer channels are marked as drained
|
||||
@ -3154,11 +3176,11 @@ static int ft_event_check_bookmarks(void)
|
||||
if( 10 <= mca_crcp_coord_component.super.verbose ) {
|
||||
sleep(orte_process_info.my_name->vpid);
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"Process %s Match Table",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
"Process [%lu,%lu,%lu] Match Table",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"%s %5s | %7s | %7s | %7s | %7s |",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"[%lu,%lu,%lu] %5s | %7s | %7s | %7s | %7s |",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
"Vpid", "T_Send", "M_Recv", "M_Send", "T_Recv");
|
||||
|
||||
for(item = opal_list_get_first(&ompi_crcp_coord_pml_peer_refs);
|
||||
@ -3183,8 +3205,8 @@ static int ft_event_check_bookmarks(void)
|
||||
peer_ref->matched_recv_init_msgs );
|
||||
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"%s %5d | %7d | %7d | %7d | %7d |",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"[%lu,%lu,%lu] %5d | %7d | %7d | %7d | %7d |",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
peer_ref->proc_name.vpid,
|
||||
t_send, m_recv, m_send, t_recv);
|
||||
}
|
||||
@ -3223,11 +3245,11 @@ static int ft_event_check_bookmarks(void)
|
||||
/* T_Send >= M_Recv */
|
||||
if( p_n_to_p_m < p_n_from_p_m ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s --> %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
|
||||
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
|
||||
" WARNING: Peer received more than was sent. :(\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3238,10 +3260,10 @@ static int ft_event_check_bookmarks(void)
|
||||
* so need to coordinate with peer. */
|
||||
if( p_n_to_p_m > p_n_from_p_m) {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s --> %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
|
||||
"Sent Msgs (%4d) = Received Msgs (%4d). Peer needs %4d.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3253,8 +3275,8 @@ static int ft_event_check_bookmarks(void)
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = send_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: Unable to send message details to peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_ref->proc_name),
|
||||
"crcp:coord: check_bookmarks: Unable to send message details to peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_ref->proc_name),
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
@ -3274,11 +3296,11 @@ static int ft_event_check_bookmarks(void)
|
||||
/* M_Send >= T_Recv */
|
||||
if( p_n_to_p_m < p_n_from_p_m ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s --> %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
|
||||
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
|
||||
" WARNING: I received more than the peer sent. :(\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3289,10 +3311,10 @@ static int ft_event_check_bookmarks(void)
|
||||
* so need to coordinate with peer. */
|
||||
if( p_n_to_p_m > p_n_from_p_m) {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s <-- %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
|
||||
"Received Msgs (%4d) = Sent Msgs (%4d). I need %4d.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3303,8 +3325,8 @@ static int ft_event_check_bookmarks(void)
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = recv_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: Unable to recv message details from peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_ref->proc_name),
|
||||
"crcp:coord: check_bookmarks: Unable to recv message details from peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_ref->proc_name),
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
@ -3326,11 +3348,11 @@ static int ft_event_check_bookmarks(void)
|
||||
/* M_Send >= T_Recv */
|
||||
if( p_n_to_p_m < p_n_from_p_m ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s --> %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
|
||||
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
|
||||
" WARNING: I received more than the peer sent. :(\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3341,10 +3363,10 @@ static int ft_event_check_bookmarks(void)
|
||||
* so need to coordinate with peer. */
|
||||
if( p_n_to_p_m > p_n_from_p_m) {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s <-- %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
|
||||
"Received Msgs (%4d) = Sent Msgs (%4d). I need %4d.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3355,8 +3377,8 @@ static int ft_event_check_bookmarks(void)
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = recv_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: Unable to recv message details from peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_ref->proc_name),
|
||||
"crcp:coord: check_bookmarks: Unable to recv message details from peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_ref->proc_name),
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
@ -3376,11 +3398,11 @@ static int ft_event_check_bookmarks(void)
|
||||
/* T_Send >= M_Recv */
|
||||
if( p_n_to_p_m < p_n_from_p_m ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s --> %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
|
||||
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
|
||||
" WARNING: Peer received more than was sent. :(\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3391,10 +3413,10 @@ static int ft_event_check_bookmarks(void)
|
||||
* so need to coordinate with peer. */
|
||||
if( p_n_to_p_m > p_n_from_p_m) {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: %s --> %s "
|
||||
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
|
||||
"Sent Msgs (%4d) = Received Msgs (%4d). Peer needs %4d.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
p_n_to_p_m,
|
||||
p_n_from_p_m,
|
||||
(p_n_to_p_m - p_n_from_p_m)
|
||||
@ -3406,8 +3428,8 @@ static int ft_event_check_bookmarks(void)
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = send_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: check_bookmarks: Unable to send message details to peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_ref->proc_name),
|
||||
"crcp:coord: check_bookmarks: Unable to send message details to peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_ref->proc_name),
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
@ -3440,8 +3462,8 @@ static int ft_event_post_drain_acks(void)
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: post_drain_ack: %s Wait on %d Drain ACK Messages.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"crcp:coord: post_drain_ack: [%lu,%lu,%lu] Wait on %d Drain ACK Messages.\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
(int)req_size);
|
||||
|
||||
/*
|
||||
@ -3460,8 +3482,8 @@ static int ft_event_post_drain_acks(void)
|
||||
drain_message_ack_cbfunc,
|
||||
NULL) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: post_drain_acks: %s Failed to post a RML receive to the peer\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
"crcp:coord: post_drain_acks: [%lu,%lu,%lu] Failed to post a RML receive to the peer\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
@ -3496,23 +3518,24 @@ static void drain_message_ack_cbfunc(int status,
|
||||
/* If this ACK has not completed yet */
|
||||
if(!drain_msg_ack->complete) {
|
||||
/* If it is the correct peer */
|
||||
if(drain_msg_ack->peer.jobid == sender->jobid &&
|
||||
if(drain_msg_ack->peer.cellid == sender->cellid &&
|
||||
drain_msg_ack->peer.jobid == sender->jobid &&
|
||||
drain_msg_ack->peer.vpid == sender->vpid ) {
|
||||
/* We found it! */
|
||||
drain_msg_ack->complete = true;
|
||||
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: drain_message_ack_cbfunc: %s --> %s Received ACK of FLUSH from peer\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(sender) );
|
||||
"crcp:coord: drain_message_ack_cbfunc: [%lu,%lu,%lu] --> [%lu,%lu,%lu] Received ACK of FLUSH from peer\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(sender) );
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: drain_message_ack_cbfunc: %s --> %s ERROR: Uable to match ACK to peer\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(sender) );
|
||||
"crcp:coord: drain_message_ack_cbfunc: [%lu,%lu,%lu] --> [%lu,%lu,%lu] ERROR: Uable to match ACK to peer\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(sender) );
|
||||
|
||||
cleanup:
|
||||
return;
|
||||
@ -3530,8 +3553,8 @@ static int ft_event_post_drained(void)
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: post_drained: %s Draining %d Messages.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"crcp:coord: post_drained: [%lu,%lu,%lu] Draining %d Messages.\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
(int)req_size);
|
||||
|
||||
/*
|
||||
@ -3551,8 +3574,8 @@ static int ft_event_post_drained(void)
|
||||
*/
|
||||
if( drain_msg->already_posted ) {
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: post_drained: %s Found a message that we don't need to post.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
"crcp:coord: post_drained: [%lu,%lu,%lu] Found a message that we don't need to post.\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
@ -3560,8 +3583,8 @@ static int ft_event_post_drained(void)
|
||||
*/
|
||||
else {
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: post_drained: %s Posting a message to be drained from %d.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"crcp:coord: post_drained: [%lu,%lu,%lu] Posting a message to be drained from %d.\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
drain_msg->rank);
|
||||
if( OMPI_SUCCESS != (ret = wrapped_pml_module->pml_irecv(drain_msg->buffer,
|
||||
(drain_msg->count * drain_msg->ddt_size),
|
||||
@ -3571,8 +3594,8 @@ static int ft_event_post_drained(void)
|
||||
drain_msg->comm,
|
||||
&(drain_msg->request) ) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: post_drained: %s Failed to post the Draining PML iRecv\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name) );
|
||||
"crcp:coord: post_drained: [%lu,%lu,%lu] Failed to post the Draining PML iRecv\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name) );
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
@ -3591,8 +3614,8 @@ static int ft_event_wait_quiesce(void)
|
||||
**********************************************/
|
||||
if( OMPI_SUCCESS != (ret = wait_quiesce_drained() ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce: %s Failed to quiesce drained messages\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name) );
|
||||
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] Failed to quiesce drained messages\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name) );
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -3602,8 +3625,8 @@ static int ft_event_wait_quiesce(void)
|
||||
*******************************************************************/
|
||||
if( OMPI_SUCCESS != (ret = wait_quiesce_drain_ack() ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce: %s Failed to recv all drain ACKs\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name) );
|
||||
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] Failed to recv all drain ACKs\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name) );
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -3635,8 +3658,8 @@ static int wait_quiesce_drained(void)
|
||||
}
|
||||
|
||||
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce_drained: %s Waiting on %d messages to drain\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"crcp:coord: wait_quiesce_drained: [%lu,%lu,%lu] Waiting on %d messages to drain\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
(int)req_size);
|
||||
|
||||
/*
|
||||
@ -3667,6 +3690,7 @@ static int wait_quiesce_drained(void)
|
||||
wait_any_requests[i] = &ompi_request_null;
|
||||
wait_any_status[i] = &ompi_status_empty;
|
||||
|
||||
proc_names[i].cellid = ORTE_CELLID_INVALID;
|
||||
proc_names[i].jobid = ORTE_JOBID_INVALID;
|
||||
proc_names[i].vpid = ORTE_VPID_INVALID;
|
||||
}
|
||||
@ -3690,15 +3714,15 @@ static int wait_quiesce_drained(void)
|
||||
*/
|
||||
if( drain_msg->already_posted && NULL == drain_msg->request) {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce_drained: %s - %s Already posted this msg.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(drain_msg->proc_name)) );
|
||||
"crcp:coord: wait_quiesce_drained: [%lu,%lu,%lu] - [%lu,%lu,%lu] Already posted this msg.\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(drain_msg->proc_name)) );
|
||||
}
|
||||
else {
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce_drained: %s - %s Waiting on message. (index = %d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(drain_msg->proc_name)),
|
||||
"crcp:coord: wait_quiesce_drained: [%lu,%lu,%lu] - [%lu,%lu,%lu] Waiting on message. (index = %d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(drain_msg->proc_name)),
|
||||
(int)wait_any_count);
|
||||
|
||||
wait_any_requests[wait_any_count] = drain_msg->request;
|
||||
@ -3712,7 +3736,8 @@ static int wait_quiesce_drained(void)
|
||||
/* Add proc to response queue if it is not already there */
|
||||
found = false;
|
||||
for(i = 0; i < last_proc_idx; ++i) {
|
||||
if(proc_names[i].jobid == drain_msg->proc_name.jobid &&
|
||||
if(proc_names[i].cellid == drain_msg->proc_name.cellid &&
|
||||
proc_names[i].jobid == drain_msg->proc_name.jobid &&
|
||||
proc_names[i].vpid == drain_msg->proc_name.vpid ) {
|
||||
found = true;
|
||||
break;
|
||||
@ -3720,11 +3745,12 @@ static int wait_quiesce_drained(void)
|
||||
}
|
||||
if( !found ) {
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce: %s - %s Add process to response list [idx %d]\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(drain_msg->proc_name)),
|
||||
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] - [%lu,%lu,%lu] Add process to response list [idx %d]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(drain_msg->proc_name)),
|
||||
(int)last_proc_idx);
|
||||
|
||||
proc_names[last_proc_idx].cellid = drain_msg->proc_name.cellid;
|
||||
proc_names[last_proc_idx].jobid = drain_msg->proc_name.jobid;
|
||||
proc_names[last_proc_idx].vpid = drain_msg->proc_name.vpid;
|
||||
last_proc_idx++;
|
||||
@ -3748,8 +3774,8 @@ static int wait_quiesce_drained(void)
|
||||
* Send ACKs to all peers
|
||||
*/
|
||||
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce: %s Send ACKs to all Peers\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] Send ACKs to all Peers\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
|
||||
for(i = 0; i < last_proc_idx; ++i) {
|
||||
orte_buffer_t *buffer = NULL;
|
||||
@ -3828,8 +3854,8 @@ static int coord_request_wait_all( size_t count,
|
||||
coord_request_wait(req, status);
|
||||
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: request_wait_all: %s Done with idx %d of %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"crcp:coord: request_wait_all: [%lu,%lu,%lu] Done with idx %d of %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
(int)i, (int)count);
|
||||
}
|
||||
|
||||
@ -3871,8 +3897,8 @@ static int wait_quiesce_drain_ack(void)
|
||||
}
|
||||
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: wait_quiesce_drain_ack: %s Waiting on %d Drain ACK messages\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
"crcp:coord: wait_quiesce_drain_ack: [%lu,%lu,%lu] Waiting on %d Drain ACK messages\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
num_outstanding);
|
||||
|
||||
while(0 < num_outstanding) {
|
||||
@ -3913,6 +3939,7 @@ static int send_bookmarks(int peer_idx)
|
||||
/*
|
||||
* Find the peer structure for this peer
|
||||
*/
|
||||
peer_name.cellid = orte_process_info.my_name->cellid;
|
||||
peer_name.jobid = orte_process_info.my_name->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
|
||||
@ -3925,9 +3952,9 @@ static int send_bookmarks(int peer_idx)
|
||||
}
|
||||
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: send_bookmarks: %s -> %s Sending bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&peer_name),
|
||||
"crcp:coord: send_bookmarks: [%lu,%lu,%lu] -> [%lu,%lu,%lu] Sending bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&peer_name),
|
||||
peer_ref->total_send_msgs,
|
||||
peer_ref->total_isend_msgs,
|
||||
peer_ref->total_send_init_msgs,
|
||||
@ -3959,8 +3986,8 @@ static int send_bookmarks(int peer_idx)
|
||||
|
||||
if ( 0 > ( ret = orte_rml.send_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: send_bookmarks: Failed to send bookmark to peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_name),
|
||||
"crcp:coord: send_bookmarks: Failed to send bookmark to peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_name),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -3986,6 +4013,7 @@ static int recv_bookmarks(int peer_idx)
|
||||
/*
|
||||
* Find the peer structure for this peer
|
||||
*/
|
||||
peer_name.cellid = orte_process_info.my_name->cellid;
|
||||
peer_name.jobid = orte_process_info.my_name->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
|
||||
@ -4007,8 +4035,8 @@ static int recv_bookmarks(int peer_idx)
|
||||
|
||||
if ( 0 > (ret = orte_rml.recv_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_name),
|
||||
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_name),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -4035,9 +4063,9 @@ static int recv_bookmarks(int peer_idx)
|
||||
peer_ref->matched_recv_init_msgs = tmp_int;
|
||||
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_bookmarks: %s <- %s Received bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&peer_name),
|
||||
"crcp:coord: recv_bookmarks: [%lu,%lu,%lu] <- [%lu,%lu,%lu] Received bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&peer_name),
|
||||
peer_ref->matched_send_msgs,
|
||||
peer_ref->matched_isend_msgs,
|
||||
peer_ref->matched_send_init_msgs,
|
||||
@ -4091,9 +4119,9 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
found_match = false;
|
||||
if(OMPI_SUCCESS != (ret = do_send_msg_detail(peer_ref, msg_ref, &found_match, &finished)) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: send_msg_details: %s --> %s Failed to send message details to peer. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
"crcp:coord: send_msg_details: [%lu,%lu,%lu] --> [%lu,%lu,%lu] Failed to send message details to peer. Return %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
}
|
||||
if(found_match) {
|
||||
@ -4137,14 +4165,15 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
* inflight messages into a local buffer
|
||||
*/
|
||||
d_msg_ack = OBJ_NEW(drain_msg_ack_ref_t);
|
||||
d_msg_ack->peer.cellid = peer_ref->proc_name.cellid;
|
||||
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
|
||||
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
|
||||
d_msg_ack->complete = false;
|
||||
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: send_msg_details: %s <--> %s Will wait on ACK from this peer.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)));
|
||||
"crcp:coord: send_msg_details: [%lu,%lu,%lu] <--> [%lu,%lu,%lu] Will wait on ACK from this peer.\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)));
|
||||
|
||||
/*
|
||||
* If we know that we are in the middle of a blocking send/recv then we
|
||||
@ -4214,8 +4243,8 @@ static int do_send_msg_detail(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
if ( 0 > ( ret = orte_rml.send_buffer(&peer_ref->proc_name, buffer,
|
||||
OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: do_send_msg_detail: Unable to send message details to peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_ref->proc_name),
|
||||
"crcp:coord: do_send_msg_detail: Unable to send message details to peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_ref->proc_name),
|
||||
ret);
|
||||
|
||||
exit_status = OMPI_ERROR;
|
||||
@ -4241,9 +4270,9 @@ static int do_send_msg_detail(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
if ( 0 > (ret = orte_rml.recv_buffer(&peer_ref->proc_name, buffer,
|
||||
OMPI_CRCP_COORD_BOOKMARK_TAG) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: do_send_msg_detail: %s --> %s Failed to receive ACK buffer from peer. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
"crcp:coord: do_send_msg_detail: [%lu,%lu,%lu] --> [%lu,%lu,%lu] Failed to receive ACK buffer from peer. Return %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -4312,10 +4341,10 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
&p_tag, &p_count,
|
||||
&p_datatype_size)) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_details: %s <-- %s "
|
||||
"crcp:coord: recv_msg_details: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
|
||||
"Failed to receive message detail from peer. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -4332,10 +4361,10 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
p_datatype_size,
|
||||
&found_match) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_details: %s <-- %s "
|
||||
"crcp:coord: recv_msg_details: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
|
||||
"Failed to check message detail from peer. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -4360,9 +4389,9 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
|
||||
if(OMPI_SUCCESS != (ret = do_recv_msg_detail_resp(peer_ref, response))) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_details: %s <-- %s Failed to respond to peer. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
"crcp:coord: recv_msg_details: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] Failed to respond to peer. Return %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -4392,9 +4421,9 @@ static int do_recv_msg_detail(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
*/
|
||||
if ( 0 > (ret = orte_rml.recv_buffer(&peer_ref->proc_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: do_recv_msg_detail: %s <-- %s Failed to receive buffer from peer. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
"crcp:coord: do_recv_msg_detail: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] Failed to receive buffer from peer. Return %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -4453,20 +4482,20 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
&msg_already_posted); /* Has the recv already been posted? */
|
||||
if( OMPI_SUCCESS != ret) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: %s -- %s "
|
||||
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] -- [%lu,%lu,%lu] "
|
||||
"Failed to determine if we have received this message. Return %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_output_verbose(20, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: %s -- %s"
|
||||
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] -- [%lu,%lu,%lu]"
|
||||
" found %s, complete %s, posted %s, peer_rank=[%d vs %d]\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
|
||||
(true == msg_found ? "True " : "False"),
|
||||
(true == msg_complete ? "True " : "False"),
|
||||
(true == msg_already_posted ? "True " : "False"),
|
||||
@ -4482,8 +4511,8 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
ompi_crcp_coord_pml_message_ref_t *d_msg = NULL;
|
||||
|
||||
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: %s Found a message that needs to be drained\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name) );
|
||||
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] Found a message that needs to be drained\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name) );
|
||||
|
||||
/*
|
||||
* Construct a message for draining
|
||||
@ -4493,6 +4522,7 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
0, NULL, /* Setup the datatype outside of this */
|
||||
tag, rank, ompi_comm_lookup(comm_id),
|
||||
NULL,
|
||||
peer_ref->proc_name.cellid,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
/*
|
||||
@ -4540,9 +4570,9 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
ompi_crcp_coord_pml_message_ref_t *d_msg = NULL;
|
||||
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: %s "
|
||||
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] "
|
||||
"Found a message already posted! Prepare to drain.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
|
||||
/*
|
||||
* If this is the current blocking recv,
|
||||
@ -4551,9 +4581,9 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
if( current_msg_id == posted_msg_ref->msg_id &&
|
||||
COORD_MSG_TYPE_B_RECV == posted_msg_ref->msg_type) {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: %s "
|
||||
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] "
|
||||
"Found a message already posted! Prepare to STALL.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
stall_for_completion = true;
|
||||
}
|
||||
/*
|
||||
@ -4562,9 +4592,9 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
*/
|
||||
else {
|
||||
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: %s "
|
||||
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] "
|
||||
"Found a message already posted! No stall required [%3d, %3d, %3d, %3d].\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
(int)current_msg_id,
|
||||
(int)current_msg_type,
|
||||
(int)posted_msg_ref->msg_id,
|
||||
@ -4596,6 +4626,7 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
count, NULL,
|
||||
tag, rank, ompi_comm_lookup(comm_id),
|
||||
posted_msg_ref->request,
|
||||
peer_ref->proc_name.cellid,
|
||||
peer_ref->proc_name.jobid,
|
||||
peer_ref->proc_name.vpid);
|
||||
|
||||
@ -4611,8 +4642,8 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
}
|
||||
else {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_check: ***** ERROR ***** %s Failed to find an action to use. This should never happen!\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
"crcp:coord: recv_msg_detail_check: ***** ERROR ***** [%lu,%lu,%lu] Failed to find an action to use. This should never happen!\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
exit_status = OMPI_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
@ -4898,8 +4929,8 @@ static int do_recv_msg_detail_resp(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
|
||||
|
||||
if ( 0 > ( ret = orte_rml.send_buffer(&peer_ref->proc_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_msg_detail_resp: Unable to send message detail response to peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_ref->proc_name),
|
||||
"crcp:coord: recv_msg_detail_resp: Unable to send message detail response to peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_ref->proc_name),
|
||||
ret);
|
||||
exit_status = OMPI_ERROR;
|
||||
goto cleanup;
|
||||
@ -4957,6 +4988,7 @@ static int coord_basic_barrier_send(int peer_idx)
|
||||
/*
|
||||
* Find the peer structure for this peer
|
||||
*/
|
||||
peer_name.cellid = orte_process_info.my_name->cellid;
|
||||
peer_name.jobid = orte_process_info.my_name->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
|
||||
@ -4974,8 +5006,8 @@ static int coord_basic_barrier_send(int peer_idx)
|
||||
/* JJH -- Really Establish TAG in rml_types.h */
|
||||
if ( 0 > ( ret = orte_rml.send_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG+1, 0)) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: coord_basic_barrier_send: Failed to send ACK to peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_name),
|
||||
"crcp:coord: coord_basic_barrier_send: Failed to send ACK to peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_name),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -5001,6 +5033,7 @@ static int coord_basic_barrier_recv(int peer_idx)
|
||||
/*
|
||||
* Find the peer structure for this peer
|
||||
*/
|
||||
peer_name.cellid = orte_process_info.my_name->cellid;
|
||||
peer_name.jobid = orte_process_info.my_name->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
|
||||
@ -5014,8 +5047,8 @@ static int coord_basic_barrier_recv(int peer_idx)
|
||||
|
||||
if ( 0 > (ret = orte_rml.recv_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG+1) ) ) {
|
||||
opal_output(mca_crcp_coord_component.super.output_handle,
|
||||
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer %s: Return %d\n",
|
||||
ORTE_NAME_PRINT(&peer_name),
|
||||
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer [%lu,%lu,%lu]: Return %d\n",
|
||||
ORTE_NAME_ARGS(&peer_name),
|
||||
ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
|
@ -400,9 +400,9 @@ int mca_mpool_rdma_release_memory(struct mca_mpool_base_module_t *mpool,
|
||||
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
|
||||
{
|
||||
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
|
||||
opal_output(0, "%s rdma: stats "
|
||||
opal_output(0, "[%lu,%lu,%lu] rdma: stats "
|
||||
"(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss,
|
||||
mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound,
|
||||
mpool_rdma->stat_evicted);
|
||||
|
@ -318,14 +318,14 @@ mca_pml_base_pml_check_selected(const char *my_pml,
|
||||
if ((size != strlen(my_pml) + 1) ||
|
||||
(0 != strcmp(my_pml, remote_pml))) {
|
||||
if (procs[i]->proc_hostname) {
|
||||
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
||||
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
||||
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name),
|
||||
opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] on %s selected pml %s",
|
||||
ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
|
||||
my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
|
||||
procs[i]->proc_hostname, remote_pml);
|
||||
} else {
|
||||
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
|
||||
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
||||
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name),
|
||||
opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] selected pml %s",
|
||||
ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
|
||||
my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
|
||||
remote_pml);
|
||||
}
|
||||
return OMPI_ERR_UNREACH;
|
||||
|
@ -292,7 +292,7 @@ ompi_proc_t * ompi_proc_find ( const orte_process_name_t * name )
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* return the proc-struct which matches this jobid+process id */
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
OPAL_THREAD_LOCK(&ompi_proc_lock);
|
||||
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
||||
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
||||
@ -315,7 +315,7 @@ ompi_proc_find_and_add(const orte_process_name_t * name, bool* isnew)
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/* return the proc-struct which matches this jobid+process id */
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
OPAL_THREAD_LOCK(&ompi_proc_lock);
|
||||
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
|
||||
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
|
||||
@ -550,7 +550,7 @@ static void callback(orte_gpr_notify_data_t *data, void *cbdata)
|
||||
OPAL_THREAD_LOCK(&ompi_proc_lock);
|
||||
|
||||
/* loop over the data returned in the subscription */
|
||||
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
|
||||
value = (orte_gpr_value_t**)(data->values)->addr;
|
||||
for (i = 0, k=0; k < data->cnt &&
|
||||
i < (data->values)->size; ++i) {
|
||||
|
@ -806,8 +806,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
ompi_mpi_initialized = true;
|
||||
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "%s ompi_mpi_init completed",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] ompi_mpi_init completed",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
/* Do we need to wait for a TotalView-like debugger? */
|
||||
|
@ -46,7 +46,7 @@ static OBJ_CLASS_INSTANCE(
|
||||
NULL);
|
||||
|
||||
#define GET_KEY(proc) \
|
||||
( (((uint32_t) proc->jobid) << 24) + ((uint32_t) proc->vpid) )
|
||||
( (((uint32_t) proc->cellid) << 24) + (((uint32_t) proc->jobid) << 16) + ((uint32_t) proc->vpid) )
|
||||
|
||||
void* orte_hash_table_get_proc(opal_hash_table_t* ht,
|
||||
const orte_process_name_t* proc)
|
||||
|
@ -92,44 +92,48 @@ typedef void* orte_iov_base_ptr_t;
|
||||
#define ORTE_NAME (orte_data_type_t) 22 /**< an orte_process_name_t */
|
||||
#define ORTE_VPID (orte_data_type_t) 23 /**< a vpid */
|
||||
#define ORTE_JOBID (orte_data_type_t) 24 /**< a jobid */
|
||||
#define ORTE_NODEID (orte_data_type_t) 25 /**< a node id */
|
||||
#define ORTE_PSET (orte_data_type_t) 25 /**< a process set */
|
||||
#define ORTE_CELLID (orte_data_type_t) 26 /**< a cellid */
|
||||
#define ORTE_NODEID (orte_data_type_t) 27 /**< a node id */
|
||||
/* SMR types */
|
||||
#define ORTE_NODE_STATE (orte_data_type_t) 26 /**< node status flag */
|
||||
#define ORTE_PROC_STATE (orte_data_type_t) 27 /**< process/resource status */
|
||||
#define ORTE_JOB_STATE (orte_data_type_t) 28 /**< job status flag */
|
||||
#define ORTE_EXIT_CODE (orte_data_type_t) 29 /**< process exit code */
|
||||
#define ORTE_NODE_STATE (orte_data_type_t) 28 /**< node status flag */
|
||||
#define ORTE_PROC_STATE (orte_data_type_t) 29 /**< process/resource status */
|
||||
#define ORTE_PSET_STATE (orte_data_type_t) 30 /**< process set state */
|
||||
#define ORTE_JOB_STATE (orte_data_type_t) 31 /**< job status flag */
|
||||
#define ORTE_EXIT_CODE (orte_data_type_t) 32 /**< process exit code */
|
||||
/* GPR types */
|
||||
#define ORTE_GPR_KEYVAL (orte_data_type_t) 30 /**< registry key-value pair */
|
||||
#define ORTE_GPR_NOTIFY_ACTION (orte_data_type_t) 31 /**< registry notify action */
|
||||
#define ORTE_GPR_TRIGGER_ACTION (orte_data_type_t) 32 /**< registry trigger action */
|
||||
#define ORTE_GPR_CMD (orte_data_type_t) 33 /**< registry command */
|
||||
#define ORTE_GPR_SUBSCRIPTION_ID (orte_data_type_t) 34 /**< registry notify id tag */
|
||||
#define ORTE_GPR_TRIGGER_ID (orte_data_type_t) 35 /**< registry notify id tag */
|
||||
#define ORTE_GPR_VALUE (orte_data_type_t) 36 /**< registry return value */
|
||||
#define ORTE_GPR_ADDR_MODE (orte_data_type_t) 37 /**< Addressing mode for registry cmds */
|
||||
#define ORTE_GPR_SUBSCRIPTION (orte_data_type_t) 38 /**< describes data returned by subscription */
|
||||
#define ORTE_GPR_TRIGGER (orte_data_type_t) 39 /**< describes trigger conditions */
|
||||
#define ORTE_GPR_NOTIFY_DATA (orte_data_type_t) 40 /**< data returned from a subscription */
|
||||
#define ORTE_GPR_NOTIFY_MSG (orte_data_type_t) 41 /**< notify message containing notify_data objects */
|
||||
#define ORTE_GPR_NOTIFY_MSG_TYPE (orte_data_type_t) 42 /**< notify message type (subscription or trigger) */
|
||||
#define ORTE_GPR_SEARCH (orte_data_type_t) 43 /**< search criteria */
|
||||
#define ORTE_GPR_UPDATE (orte_data_type_t) 44 /**< update data on the registry */
|
||||
#define ORTE_GPR_KEYVAL (orte_data_type_t) 33 /**< registry key-value pair */
|
||||
#define ORTE_GPR_NOTIFY_ACTION (orte_data_type_t) 34 /**< registry notify action */
|
||||
#define ORTE_GPR_TRIGGER_ACTION (orte_data_type_t) 35 /**< registry trigger action */
|
||||
#define ORTE_GPR_CMD (orte_data_type_t) 36 /**< registry command */
|
||||
#define ORTE_GPR_SUBSCRIPTION_ID (orte_data_type_t) 37 /**< registry notify id tag */
|
||||
#define ORTE_GPR_TRIGGER_ID (orte_data_type_t) 38 /**< registry notify id tag */
|
||||
#define ORTE_GPR_VALUE (orte_data_type_t) 39 /**< registry return value */
|
||||
#define ORTE_GPR_ADDR_MODE (orte_data_type_t) 40 /**< Addressing mode for registry cmds */
|
||||
#define ORTE_GPR_SUBSCRIPTION (orte_data_type_t) 41 /**< describes data returned by subscription */
|
||||
#define ORTE_GPR_TRIGGER (orte_data_type_t) 42 /**< describes trigger conditions */
|
||||
#define ORTE_GPR_NOTIFY_DATA (orte_data_type_t) 43 /**< data returned from a subscription */
|
||||
#define ORTE_GPR_NOTIFY_MSG (orte_data_type_t) 44 /**< notify message containing notify_data objects */
|
||||
#define ORTE_GPR_NOTIFY_MSG_TYPE (orte_data_type_t) 45 /**< notify message type (subscription or trigger) */
|
||||
#define ORTE_GPR_SEARCH (orte_data_type_t) 46 /**< search criteria */
|
||||
#define ORTE_GPR_UPDATE (orte_data_type_t) 47 /**< update data on the registry */
|
||||
/* Resource Manager types */
|
||||
#define ORTE_APP_CONTEXT (orte_data_type_t) 45 /**< argv and enviro arrays */
|
||||
#define ORTE_APP_CONTEXT_MAP (orte_data_type_t) 46 /**< application context mapping array */
|
||||
#define ORTE_NODE_DESC (orte_data_type_t) 47 /**< describes capabilities of nodes */
|
||||
#define ORTE_SLOT_DESC (orte_data_type_t) 48 /**< describes slot allocations/reservations */
|
||||
#define ORTE_RAS_NODE (orte_data_type_t) 49 /**< node information */
|
||||
#define ORTE_JOB_MAP (orte_data_type_t) 50 /**< map of process locations */
|
||||
#define ORTE_MAPPED_PROC (orte_data_type_t) 51 /**< process entry on map */
|
||||
#define ORTE_MAPPED_NODE (orte_data_type_t) 52 /**< node entry on map */
|
||||
#define ORTE_ATTRIBUTE (orte_data_type_t) 53 /**< attribute used to control framework behavior */
|
||||
#define ORTE_ATTR_LIST (orte_data_type_t) 54 /**< list of attributes */
|
||||
#define ORTE_APP_CONTEXT (orte_data_type_t) 48 /**< argv and enviro arrays */
|
||||
#define ORTE_APP_CONTEXT_MAP (orte_data_type_t) 49 /**< application context mapping array */
|
||||
#define ORTE_NODE_DESC (orte_data_type_t) 50 /**< describes capabilities of nodes */
|
||||
#define ORTE_CELL_DESC (orte_data_type_t) 51 /**< describe attributes of cells */
|
||||
#define ORTE_SLOT_DESC (orte_data_type_t) 52 /**< describes slot allocations/reservations */
|
||||
#define ORTE_RAS_NODE (orte_data_type_t) 53 /**< node information */
|
||||
#define ORTE_JOB_MAP (orte_data_type_t) 54 /**< map of process locations */
|
||||
#define ORTE_MAPPED_PROC (orte_data_type_t) 55 /**< process entry on map */
|
||||
#define ORTE_MAPPED_NODE (orte_data_type_t) 56 /**< node entry on map */
|
||||
#define ORTE_ATTRIBUTE (orte_data_type_t) 57 /**< attribute used to control framework behavior */
|
||||
#define ORTE_ATTR_LIST (orte_data_type_t) 58 /**< list of attributes */
|
||||
/* RML types */
|
||||
#define ORTE_RML_TAG (orte_data_type_t) 55 /**< tag for sending/receiving messages */
|
||||
#define ORTE_RML_TAG (orte_data_type_t) 59 /**< tag for sending/receiving messages */
|
||||
|
||||
/* DAEMON communication type */
|
||||
#define ORTE_DAEMON_CMD (orte_data_type_t) 56 /**< command flag for communicating with the daemon */
|
||||
#define ORTE_DAEMON_CMD (orte_data_type_t) 60 /**< command flag for communicating with the daemon */
|
||||
|
||||
/* Need a command separate from ORTE_DAEMON_CMD, so that we can receive on
|
||||
* them both at the same time */
|
||||
|
@ -42,9 +42,14 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
if (NULL == orte_process_info.my_name) {
|
||||
opal_output(0, "[NO-NAME] ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
} else {
|
||||
opal_output(0, "[%lu,%lu,%lu] ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_ERROR_NAME(error_code), filename, line);
|
||||
}
|
||||
}
|
||||
|
||||
int orte_errmgr_base_proc_aborted_not_avail(orte_gpr_notify_message_t *msg)
|
||||
|
@ -154,8 +154,8 @@ orte_errmgr_bproc_component_init(bool *allow_multi_user_threads, bool *have_hidd
|
||||
int orte_errmgr_bproc_finalize(void)
|
||||
{
|
||||
if (orte_errmgr_bproc_globals.debug) {
|
||||
opal_output(0, "%s errmgr_bproc_finalize called",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_bproc_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
@ -159,8 +159,8 @@ int orte_errmgr_hnp_finalize(void)
|
||||
int rc;
|
||||
|
||||
if (orte_errmgr_hnp_globals.debug) {
|
||||
opal_output(0, "%s errmgr_hnp_finalize called",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_hnp_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
/* stop the receive function */
|
||||
|
@ -154,8 +154,8 @@ orte_errmgr_orted_component_init(bool *allow_multi_user_threads, bool *have_hidd
|
||||
int orte_errmgr_orted_finalize(void)
|
||||
{
|
||||
if (orte_errmgr_orted_globals.debug) {
|
||||
opal_output(0, "%s errmgr_orted_finalize called",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_orted_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
@ -153,8 +153,8 @@ orte_errmgr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidd
|
||||
int orte_errmgr_proxy_finalize(void)
|
||||
{
|
||||
if (orte_errmgr_proxy_globals.debug) {
|
||||
opal_output(0, "%s errmgr_proxy_finalize called",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] errmgr_proxy_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
initialized = false;
|
||||
|
@ -155,7 +155,8 @@ int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine
|
||||
* Contact GPR and get the 'orte-node-name' for this process
|
||||
*/
|
||||
/* if it is the root then we need a different key :/ */
|
||||
if(proc->jobid == 0 &&
|
||||
if(proc->cellid == 0 &&
|
||||
proc->jobid == 0 &&
|
||||
proc->vpid == 0) {
|
||||
keys[0] = ORTE_PROC_RML_IP_ADDRESS_KEY;
|
||||
}
|
||||
@ -360,9 +361,9 @@ void orte_filem_base_query_callback(int status,
|
||||
}
|
||||
|
||||
opal_output_verbose(10, orte_filem_base_output,
|
||||
"filem:base: filem_base_query_callback: %s -> %s: Filename Requested (%s) translated to (%s)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(peer),
|
||||
"filem:base: filem_base_query_callback: [%lu,%lu,%lu] -> [%lu,%lu,%lu]: Filename Requested (%s) translated to (%s)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(peer),
|
||||
filename, tmp_name);
|
||||
|
||||
/*
|
||||
|
@ -426,9 +426,9 @@ static void orte_filem_rsh_query_callback(int status,
|
||||
void* cbdata)
|
||||
{
|
||||
opal_output_verbose(10, mca_filem_rsh_component.super.output_handle,
|
||||
"filem:rsh: query_callback(%s -> %s)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(peer));
|
||||
"filem:rsh: query_callback([%lu,%lu,%lu] -> [%lu,%lu,%lu])",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(peer));
|
||||
|
||||
/* Call the base callback function */
|
||||
orte_filem_base_query_callback(status, peer, buffer, tag, cbdata);
|
||||
|
@ -229,8 +229,8 @@ orte_gpr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_
|
||||
if (NULL != orte_process_info.gpr_replica_uri) {
|
||||
|
||||
if (orte_gpr_proxy_globals.debug) {
|
||||
opal_output(0, "%s gpr_proxy_init: proxy selected",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_proxy_init: proxy selected",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
/* setup the replica location */
|
||||
@ -325,8 +325,8 @@ int orte_gpr_proxy_finalize(void)
|
||||
orte_gpr_proxy_trigger_t **ltrigs;
|
||||
|
||||
if (orte_gpr_proxy_globals.debug) {
|
||||
opal_output(0, "%s gpr_proxy_finalize called",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_proxy_finalize called",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
if (initialized) {
|
||||
|
@ -95,7 +95,7 @@ int orte_gpr_proxy_exec_compound_cmd(orte_buffer_t *buffer)
|
||||
int rc, response;
|
||||
|
||||
if (orte_gpr_proxy_globals.debug) {
|
||||
opal_output(0, "[%ld,%ld] transmitting compound command",
|
||||
opal_output(0, "[%lu,%lu,%lu] transmitting compound command",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
|
@ -47,7 +47,7 @@ int orte_gpr_proxy_dump_local_triggers(void)
|
||||
orte_gpr_proxy_trigger_t **trigs;
|
||||
orte_std_cntr_t j, k;
|
||||
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for [%ld,%ld]\n",
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(orte_gpr_base_output, "Number of triggers: %lu\n", (unsigned long) orte_gpr_proxy_globals.num_trigs);
|
||||
|
||||
@ -72,7 +72,7 @@ int orte_gpr_proxy_dump_local_subscriptions(void)
|
||||
orte_gpr_proxy_subscriber_t **subs;
|
||||
orte_std_cntr_t j, k;
|
||||
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for [%ld,%ld]\n",
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(orte_gpr_base_output, "Number of subscriptions: %lu\n", (unsigned long) orte_gpr_proxy_globals.num_subs);
|
||||
|
||||
|
@ -42,8 +42,8 @@ int orte_gpr_replica_dump_all(void)
|
||||
int rc;
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_dump_all: entered",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_all: entered",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
|
||||
@ -74,8 +74,8 @@ int orte_gpr_replica_dump_segments(char *segment)
|
||||
int rc;
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_dump_segments: entered",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_segments: entered",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
|
||||
@ -106,8 +106,8 @@ int orte_gpr_replica_dump_triggers(orte_gpr_trigger_id_t start)
|
||||
int rc;
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_dump_triggers: entered",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_triggers: entered",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
|
||||
@ -290,8 +290,8 @@ int orte_gpr_replica_dump_callbacks(void)
|
||||
int rc;
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_dump_callbacks: entered",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_callbacks: entered",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
|
||||
|
@ -40,8 +40,8 @@ int orte_gpr_replica_dump_local_triggers(void)
|
||||
orte_gpr_replica_local_trigger_t **trigs;
|
||||
orte_std_cntr_t j, k;
|
||||
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for %s\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(orte_gpr_base_output, "Number of triggers: %lu\n", (unsigned long) orte_gpr_replica_globals.num_local_trigs);
|
||||
|
||||
trigs = (orte_gpr_replica_local_trigger_t**)(orte_gpr_replica_globals.local_triggers)->addr;
|
||||
@ -70,8 +70,8 @@ int orte_gpr_replica_dump_local_subscriptions(void)
|
||||
orte_gpr_replica_local_subscriber_t **subs;
|
||||
orte_std_cntr_t j, k;
|
||||
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for %s\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(orte_gpr_base_output, "Number of subscriptions: %lu\n", (unsigned long) orte_gpr_replica_globals.num_local_subs);
|
||||
|
||||
subs = (orte_gpr_replica_local_subscriber_t**)(orte_gpr_replica_globals.local_subscriptions)->addr;
|
||||
|
@ -64,8 +64,8 @@ void orte_gpr_replica_recv(int status, orte_process_name_t* sender,
|
||||
OPAL_TRACE(3);
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr replica: received message from %s",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), ORTE_NAME_PRINT(sender));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr replica: received message from [%lu,%lu,%lu]",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), ORTE_NAME_ARGS(sender));
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
|
||||
|
@ -96,7 +96,7 @@ int orte_gpr_replica_remote_notify(orte_process_name_t *recipient,
|
||||
* process is done
|
||||
*/
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
opal_output(0, "send failed to %s", ORTE_NAME_PRINT(recipient));
|
||||
opal_output(0, "send failed to [%ld,%ld,%ld]", ORTE_NAME_ARGS(recipient));
|
||||
orte_dss.dump(0, message, ORTE_GPR_NOTIFY_MSG);
|
||||
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
|
@ -79,8 +79,8 @@ int orte_gpr_replica_cleanup_proc_fn(orte_process_name_t *proc)
|
||||
OPAL_TRACE(2);
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_cleanup_proc: function entered for process %s",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), ORTE_NAME_PRINT(proc));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_cleanup_proc: function entered for process [%lu,%lu,%lu]",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), ORTE_NAME_ARGS(proc));
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&procname, proc))) {
|
||||
|
@ -236,8 +236,8 @@ int orte_gpr_replica_dump_callbacks_fn(orte_buffer_t *buffer)
|
||||
if (NULL == cb->requestor) {
|
||||
sprintf(tmp_out, "Local requestor");
|
||||
} else {
|
||||
sprintf(tmp_out, "Requestor: %s",
|
||||
ORTE_NAME_PRINT(cb->requestor));
|
||||
sprintf(tmp_out, "Requestor: [%lu,%lu,%lu]",
|
||||
ORTE_NAME_ARGS(cb->requestor));
|
||||
}
|
||||
orte_gpr_replica_dump_load_string(buffer, &tmp_out);
|
||||
orte_gpr_base_dump_notify_msg(buffer, cb->message);
|
||||
@ -420,8 +420,8 @@ int orte_gpr_replica_dump_trigger(orte_buffer_t *buffer,
|
||||
sprintf(tmp_out, "\t\tRequestor %lu: LOCAL@idtag %lu",
|
||||
(unsigned long)j, (unsigned long)attached[i]->idtag);
|
||||
} else {
|
||||
sprintf(tmp_out, "\t\tRequestor %lu: %s@idtag %lu",
|
||||
(unsigned long)j, ORTE_NAME_PRINT(attached[i]->requestor),
|
||||
sprintf(tmp_out, "\t\tRequestor %lu: [%lu,%lu,%lu]@idtag %lu",
|
||||
(unsigned long)j, ORTE_NAME_ARGS(attached[i]->requestor),
|
||||
(unsigned long)attached[i]->idtag);
|
||||
}
|
||||
orte_gpr_replica_dump_load_string(buffer, &tmp_out);
|
||||
@ -435,8 +435,8 @@ int orte_gpr_replica_dump_trigger(orte_buffer_t *buffer,
|
||||
sprintf(tmp_out, "\tTRIGGER MASTER: LOCAL@idtag %lu",
|
||||
(unsigned long)trig->master->idtag);
|
||||
} else {
|
||||
sprintf(tmp_out, "\tTRIGGER MASTER: %s@idtag %lu",
|
||||
ORTE_NAME_PRINT(trig->master->requestor),
|
||||
sprintf(tmp_out, "\tTRIGGER MASTER: [%lu,%lu,%lu]@idtag %lu",
|
||||
ORTE_NAME_ARGS(trig->master->requestor),
|
||||
(unsigned long)trig->master->idtag);
|
||||
}
|
||||
}
|
||||
@ -612,8 +612,8 @@ int orte_gpr_replica_dump_subscription(orte_buffer_t *buffer,
|
||||
sprintf(tmp_out, "\t\tRequestor: LOCAL @ subscription id %lu",
|
||||
(unsigned long) reqs[j]->idtag);
|
||||
} else {
|
||||
sprintf(tmp_out, "\t\tRequestor: %s @ subscription id %lu",
|
||||
ORTE_NAME_PRINT(reqs[j]->requestor),
|
||||
sprintf(tmp_out, "\t\tRequestor: [%lu,%lu,%lu] @ subscription id %lu",
|
||||
ORTE_NAME_ARGS(reqs[j]->requestor),
|
||||
(unsigned long) reqs[j]->idtag);
|
||||
}
|
||||
orte_gpr_replica_dump_load_string(buffer, &tmp_out);
|
||||
|
@ -130,8 +130,8 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
char *tmp;
|
||||
|
||||
opal_output(0, "%s gpr_replica_put: entered on segment %s\nValues:",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), seg->name);
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_put: entered on segment %s\nValues:",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), seg->name);
|
||||
for (i=0; i < cnt; i++) {
|
||||
opal_output(0, "\tKey: %s", keyvals[i]->key);
|
||||
}
|
||||
@ -282,7 +282,7 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
|
||||
}
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_put: complete", ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_put: complete", ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -322,8 +322,8 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
char *token;
|
||||
opal_output(0, "%s gpr_replica_get: entered",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_get: entered",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
opal_output(0, "\tGetting data from segment %s with %d tokens and %d keys",
|
||||
seg->name, num_tokens, num_keys);
|
||||
for (i=0; i < num_tokens; i++) {
|
||||
@ -520,8 +520,8 @@ CLEANUP:
|
||||
OBJ_DESTRUCT(&get_list);
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_get: finished search",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_get: finished search",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -721,8 +721,8 @@ CLEANUP:
|
||||
OBJ_DESTRUCT(&get_list);
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_get: finished search",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_get: finished search",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -52,8 +52,8 @@ int orte_gpr_replica_subscribe_fn(orte_process_name_t *requestor,
|
||||
OPAL_TRACE(2);
|
||||
|
||||
if (orte_gpr_replica_globals.debug) {
|
||||
opal_output(0, "%s gpr_replica_subscribe: entered with num_trigs:%d",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), num_trigs);
|
||||
opal_output(0, "[%lu,%lu,%lu] gpr_replica_subscribe: entered with num_trigs:%d",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), num_trigs);
|
||||
}
|
||||
|
||||
/* ensure one of the search arrays is clear - in this case, we
|
||||
|
@ -74,7 +74,7 @@ in the job. Note that the jobid=1 and the mask=2. So, we expect this
|
||||
to collect the stdout from any of the ranks. Obviously the second
|
||||
subscriber says the same thing but for stderr. The third subscriber
|
||||
is for receiving data from stdin and sending it out to rank 0 of
|
||||
the job. Notice the mask=ff which means compare jobid,vpid
|
||||
the job. Notice the mask=ff which means compare cellid,jobid,vpid
|
||||
when addressing where the data goes.
|
||||
|
||||
The first endpoint is created by a call to pull by the rmgr. After
|
||||
@ -90,6 +90,7 @@ tied to the subscription. Hmmm, this I do not really understand.
|
||||
APPENDIX A
|
||||
These are the defines that go with the mask.
|
||||
#define ORTE_NS_CMP_NONE 0x00
|
||||
#define ORTE_NS_CMP_CELLID 0x01
|
||||
#define ORTE_NS_CMP_JOBID 0x02
|
||||
#define ORTE_NS_CMP_VPID 0x04
|
||||
#define ORTE_NS_CMP_ALL 0Xff
|
||||
|
@ -127,7 +127,7 @@ int orte_iof_proxy_unpublish(
|
||||
#if 0
|
||||
{
|
||||
int i = 0;
|
||||
opal_output(orte_iof_base.iof_output, "%s orted: ******** ABOUT TO IOF PROXY UNPUBLISH, %d", ORTE_NAME_PRINT(orte_process_info.my_name), getpid());
|
||||
opal_output(orte_iof_base.iof_output, "[%lu,%lu,%lu] orted: ******** ABOUT TO IOF PROXY UNPUBLISH, %d", ORTE_NAME_ARGS(orte_process_info.my_name), getpid());
|
||||
fflush(stderr);
|
||||
while (0 == i) sleep(5);
|
||||
}
|
||||
|
@ -114,8 +114,8 @@ orte_iof_svc_exception_handler(const orte_process_name_t* peer, orte_rml_excepti
|
||||
{
|
||||
orte_iof_base_endpoint_t *endpoint;
|
||||
opal_output(orte_iof_base.iof_output,
|
||||
"iof svc exception handler! %s\n",
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)peer));
|
||||
"iof svc exception handler! [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(peer));
|
||||
|
||||
/* If we detect an exception on the RML connection to a peer,
|
||||
delete all of its subscriptions and publications. Note that
|
||||
|
@ -174,10 +174,10 @@ static void orte_iof_svc_proxy_msg(
|
||||
/* if the subscription origin doesn't match the message's
|
||||
origin, skip this subscription */
|
||||
if(orte_ns.compare_fields(sub->origin_mask,&sub->origin_name,&hdr->msg_origin) == 0) {
|
||||
opal_output(orte_iof_base.iof_output, "sub origin %s, msg origin %s, msg proxy %s orte_iof_svc_proxy_msg: tag %d sequence %d, len %d\n",
|
||||
ORTE_NAME_PRINT(&sub->origin_name),
|
||||
ORTE_NAME_PRINT(&hdr->msg_origin),
|
||||
ORTE_NAME_PRINT(&hdr->msg_proxy),
|
||||
opal_output(orte_iof_base.iof_output, "sub origin [%lu,%lu,%lu], msg origin [%lu,%lu,%lu], msg proxy [%lu,%lu,%lu] orte_iof_svc_proxy_msg: tag %d sequence %d, len %d\n",
|
||||
ORTE_NAME_ARGS(&sub->origin_name),
|
||||
ORTE_NAME_ARGS(&hdr->msg_origin),
|
||||
ORTE_NAME_ARGS(&hdr->msg_proxy),
|
||||
hdr->msg_tag, hdr->msg_seq, hdr->msg_len);
|
||||
/* Everything matched -- forward the message */
|
||||
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
|
||||
@ -239,10 +239,10 @@ static void orte_iof_svc_proxy_pub(
|
||||
orte_iof_base_pub_header_t* hdr)
|
||||
{
|
||||
int rc;
|
||||
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_pub: mask %d, tag %d, proc %s, proxy %s",
|
||||
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_pub: mask %d, tag %d, proc [%lu,%lu,%lu], proxy [%lu,%lu,%lu]",
|
||||
hdr->pub_mask, hdr->pub_tag,
|
||||
ORTE_NAME_PRINT(&hdr->pub_name),
|
||||
ORTE_NAME_PRINT(&hdr->pub_proxy));
|
||||
ORTE_NAME_ARGS(&hdr->pub_name),
|
||||
ORTE_NAME_ARGS(&hdr->pub_proxy));
|
||||
|
||||
rc = orte_iof_svc_pub_create(
|
||||
&hdr->pub_name,
|
||||
@ -264,10 +264,10 @@ static void orte_iof_svc_proxy_unpub(
|
||||
orte_iof_base_pub_header_t* hdr)
|
||||
{
|
||||
int rc;
|
||||
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_unpub: mask %d, tag %d, proc %s, proxy %s",
|
||||
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_unpub: mask %d, tag %d, proc [%lu,%lu,%lu], proxy [%lu,%lu,%lu]",
|
||||
hdr->pub_mask, hdr->pub_tag,
|
||||
ORTE_NAME_PRINT(&hdr->pub_name),
|
||||
ORTE_NAME_PRINT(&hdr->pub_proxy));
|
||||
ORTE_NAME_ARGS(&hdr->pub_name),
|
||||
ORTE_NAME_ARGS(&hdr->pub_proxy));
|
||||
|
||||
rc = orte_iof_svc_pub_delete(
|
||||
&hdr->pub_name,
|
||||
|
@ -54,8 +54,8 @@ int orte_iof_svc_pub_create(
|
||||
pub->pub_tag = pub_tag;
|
||||
pub->pub_endpoint =
|
||||
orte_iof_base_endpoint_match(pub_name,pub_mask,pub_tag);
|
||||
opal_output(orte_iof_base.iof_output, "created svc pub, name %s, proxy %s, tag %d / mask %x, endpoint %p\n",
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)pub_name), ORTE_NAME_PRINT((orte_process_name_t*)pub_proxy),
|
||||
opal_output(orte_iof_base.iof_output, "created svc pub, name [%lu,%lu,%lu], proxy [%lu,%lu,%lu], tag %d / mask %x, endpoint %p\n",
|
||||
ORTE_NAME_ARGS(pub_name), ORTE_NAME_ARGS(pub_proxy),
|
||||
pub_tag, pub_mask, (char*) pub->pub_endpoint);
|
||||
|
||||
/* append this published endpoint to any matching subscription */
|
||||
|
@ -104,9 +104,9 @@ int orte_iof_svc_sub_create(
|
||||
sub->target_mask = target_mask;
|
||||
sub->target_tag = target_tag;
|
||||
sub->sub_endpoint = orte_iof_base_endpoint_match(&sub->target_name, sub->target_mask, sub->target_tag);
|
||||
opal_output(orte_iof_base.iof_output, "created svc sub, origin %s tag %d / mask %x, target %s, tag %d / mask %x\n",
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)origin_name), origin_tag, origin_mask,
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)target_name), target_tag, target_mask);
|
||||
opal_output(orte_iof_base.iof_output, "created svc sub, origin [%lu,%lu,%lu] tag %d / mask %x, target [%lu,%lu,%lu], tag %d / mask %x\n",
|
||||
ORTE_NAME_ARGS(origin_name), origin_tag, origin_mask,
|
||||
ORTE_NAME_ARGS(target_name), target_tag, target_mask);
|
||||
|
||||
/* search through published endpoints for a match */
|
||||
for(item = opal_list_get_first(&mca_iof_svc_component.svc_published);
|
||||
@ -191,9 +191,9 @@ void orte_iof_svc_sub_ack(
|
||||
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)s_item;
|
||||
opal_list_item_t *f_item;
|
||||
|
||||
opal_output(orte_iof_base.iof_output, "ack: checking sub origin %s tag %d / mask %x, target %s, tag %d / mask %x\n",
|
||||
ORTE_NAME_PRINT(&sub->origin_name), sub->origin_tag, sub->origin_mask,
|
||||
ORTE_NAME_PRINT(&sub->target_name), sub->target_tag, sub->target_mask);
|
||||
opal_output(orte_iof_base.iof_output, "ack: checking sub origin [%lu,%lu,%lu] tag %d / mask %x, target [%lu,%lu,%lu], tag %d / mask %x\n",
|
||||
ORTE_NAME_ARGS(&sub->origin_name), sub->origin_tag, sub->origin_mask,
|
||||
ORTE_NAME_ARGS(&sub->target_name), sub->target_tag, sub->target_mask);
|
||||
|
||||
/* If the subscription origin/tag doesn't match the ACK
|
||||
origin/tag, skip it */
|
||||
@ -223,8 +223,8 @@ void orte_iof_svc_sub_ack(
|
||||
orte_iof_svc_pub_t* pub = fwd->fwd_pub;
|
||||
bool value_set = true;
|
||||
|
||||
opal_output(orte_iof_base.iof_output, "ack: checking fwd %s tag %d / mask %x\n",
|
||||
ORTE_NAME_PRINT(&pub->pub_name), pub->pub_tag, pub->pub_mask);
|
||||
opal_output(orte_iof_base.iof_output, "ack: checking fwd [%lu,%lu,%lu] tag %d / mask %x\n",
|
||||
ORTE_NAME_ARGS(&pub->pub_name), pub->pub_tag, pub->pub_mask);
|
||||
|
||||
/* If the publication origin or publication proxy matches
|
||||
the ACK'ing proxy, save the ACK'ed byte count for this
|
||||
@ -521,12 +521,12 @@ int orte_iof_svc_fwd_create(
|
||||
}
|
||||
OBJ_RETAIN(pub);
|
||||
fwd->fwd_pub = pub;
|
||||
opal_output(orte_iof_base.iof_output, "created svc forward, sub origin %s, tag %d / mask %x, sub target %s, tag %d / mask %x :::: pub name %s, tag %d / mask %x\n",
|
||||
ORTE_NAME_PRINT(&sub->origin_name), sub->origin_tag,
|
||||
opal_output(orte_iof_base.iof_output, "created svc forward, sub origin [%lu,%lu,%lu], tag %d / mask %x, sub target [%lu,%lu,%lu], tag %d / mask %x :::: pub name [%lu,%lu,%lu], tag %d / mask %x\n",
|
||||
ORTE_NAME_ARGS(&sub->origin_name), sub->origin_tag,
|
||||
sub->origin_mask,
|
||||
ORTE_NAME_PRINT(&sub->target_name), sub->target_tag,
|
||||
ORTE_NAME_ARGS(&sub->target_name), sub->target_tag,
|
||||
sub->target_mask,
|
||||
ORTE_NAME_PRINT(&pub->pub_name), pub->pub_tag, pub->pub_mask);
|
||||
ORTE_NAME_ARGS(&pub->pub_name), pub->pub_tag, pub->pub_mask);
|
||||
opal_list_append(&sub->sub_forward, &fwd->super);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -24,11 +24,10 @@ libmca_ns_la_SOURCES += \
|
||||
base/ns_base_close.c \
|
||||
base/ns_base_select.c \
|
||||
base/ns_base_open.c \
|
||||
base/ns_base_node_fns.c \
|
||||
base/ns_base_cell_fns.c \
|
||||
base/ns_base_job_fns.c \
|
||||
base/ns_base_vpid_name_fns.c \
|
||||
base/ns_base_general_fns.c \
|
||||
base/ns_base_print_name_args.c \
|
||||
base/ns_base_diag_fns.c \
|
||||
base/data_type_support/ns_data_type_compare_fns.c \
|
||||
base/data_type_support/ns_data_type_copy_fns.c \
|
||||
|
@ -49,7 +49,6 @@ extern "C" {
|
||||
ORTE_DECLSPEC int orte_ns_base_open(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_select(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_close(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_init_print_args(void);
|
||||
|
||||
/*
|
||||
* globals that might be needed
|
||||
|
@ -49,6 +49,18 @@ int orte_ns_base_compare_name(orte_process_name_t *value1,
|
||||
* value - a totally useless result, but consistent in behavior.
|
||||
*/
|
||||
|
||||
/** check the cellids - if one of them is WILDCARD, then ignore
|
||||
* this field since anything is okay
|
||||
*/
|
||||
if (value1->cellid != ORTE_CELLID_WILDCARD &&
|
||||
value2->cellid != ORTE_CELLID_WILDCARD) {
|
||||
if (value1->cellid < value2->cellid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (value1->cellid > value2->cellid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
}
|
||||
|
||||
/** check the jobids - if one of them is WILDCARD, then ignore
|
||||
* this field since anything is okay
|
||||
*/
|
||||
@ -108,6 +120,21 @@ int orte_ns_base_compare_jobid(orte_jobid_t *value1,
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
int orte_ns_base_compare_cellid(orte_cellid_t *value1,
|
||||
orte_cellid_t *value2,
|
||||
orte_data_type_t type)
|
||||
{
|
||||
/** if either value is WILDCARD, then return equal */
|
||||
if (*value1 == ORTE_CELLID_WILDCARD ||
|
||||
*value2 == ORTE_CELLID_WILDCARD) return ORTE_EQUAL;
|
||||
|
||||
if (*value1 > *value2) return ORTE_VALUE1_GREATER;
|
||||
|
||||
if (*value2 > *value1) return ORTE_VALUE2_GREATER;
|
||||
|
||||
return ORTE_EQUAL;
|
||||
}
|
||||
|
||||
int orte_ns_base_compare_nodeid(orte_nodeid_t *value1,
|
||||
orte_nodeid_t *value2,
|
||||
orte_data_type_t type)
|
||||
|
@ -44,6 +44,25 @@ int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* CELLID
|
||||
*/
|
||||
/*
 * Duplicate a cellid into freshly malloc'd storage.
 *
 * On success *dest points at the new copy and ORTE_SUCCESS is returned.
 * If the allocation fails, the error is logged, *dest is left untouched
 * and ORTE_ERR_OUT_OF_RESOURCE is returned.  "type" is not consulted.
 */
int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type)
{
    orte_cellid_t *copy = (orte_cellid_t*)malloc(sizeof(orte_cellid_t));

    if (NULL != copy) {
        *copy = *src;
        *dest = copy;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
|
||||
|
||||
/*
|
||||
* NODEID
|
||||
*/
|
||||
@ -95,6 +114,7 @@ int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src,
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
val->cellid = src->cellid;
|
||||
val->jobid = src->jobid;
|
||||
val->vpid = src->vpid;
|
||||
|
||||
|
@ -39,9 +39,30 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, const void *src,
|
||||
int rc;
|
||||
orte_std_cntr_t i;
|
||||
orte_process_name_t* proc;
|
||||
orte_cellid_t *cellid;
|
||||
orte_jobid_t *jobid;
|
||||
orte_vpid_t *vpid;
|
||||
|
||||
/* collect all the cellids in a contiguous array */
|
||||
cellid = (orte_cellid_t*)malloc(num_vals * sizeof(orte_cellid_t));
|
||||
if (NULL == cellid) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
proc = (orte_process_name_t*)src;
|
||||
for (i=0; i < num_vals; i++) {
|
||||
cellid[i] = proc->cellid;
|
||||
proc++;
|
||||
}
|
||||
/* now pack them in one shot */
|
||||
if (ORTE_SUCCESS != (rc =
|
||||
orte_ns_base_pack_cellid(buffer, cellid, num_vals, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(cellid);
|
||||
return rc;
|
||||
}
|
||||
free(cellid);
|
||||
|
||||
/* collect all the jobids in a contiguous array */
|
||||
jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t));
|
||||
if (NULL == jobid) {
|
||||
@ -85,6 +106,23 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, const void *src,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* CELLID
|
||||
*/
|
||||
int orte_ns_base_pack_cellid(orte_buffer_t *buffer, const void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Turn around and pack the real type */
|
||||
if (ORTE_SUCCESS != (
|
||||
ret = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_CELLID_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* NODEID
|
||||
*/
|
||||
|
@ -45,6 +45,10 @@ int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_typ
|
||||
orte_ns_base_quick_print(output, "ORTE_JOBID", prefix, src, sizeof(orte_jobid_t));
|
||||
break;
|
||||
|
||||
case ORTE_CELLID:
|
||||
orte_ns_base_quick_print(output, "ORTE_CELLID", prefix, src, sizeof(orte_cellid_t));
|
||||
break;
|
||||
|
||||
case ORTE_NODEID:
|
||||
orte_ns_base_quick_print(output, "ORTE_NODEID", prefix, src, sizeof(orte_nodeid_t));
|
||||
break;
|
||||
@ -69,8 +73,8 @@ int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *na
|
||||
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: NULL",
|
||||
(NULL == prefix ? " " : prefix));
|
||||
} else {
|
||||
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%ld,%ld]",
|
||||
(NULL == prefix ? " " : prefix),
|
||||
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%ld,%ld,%ld]",
|
||||
(NULL == prefix ? " " : prefix), (long)name->cellid,
|
||||
(long)name->jobid, (long)name->vpid);
|
||||
}
|
||||
|
||||
|
@ -40,6 +40,10 @@ int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type)
|
||||
*size = sizeof(orte_jobid_t);
|
||||
break;
|
||||
|
||||
case ORTE_CELLID:
|
||||
*size = sizeof(orte_cellid_t);
|
||||
break;
|
||||
|
||||
case ORTE_NODEID:
|
||||
*size = sizeof(orte_nodeid_t);
|
||||
break;
|
||||
|
@ -37,16 +37,34 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
|
||||
int rc;
|
||||
orte_std_cntr_t i, num;
|
||||
orte_process_name_t* proc;
|
||||
orte_cellid_t *cellid;
|
||||
orte_jobid_t *jobid;
|
||||
orte_vpid_t *vpid;
|
||||
|
||||
num = *num_vals;
|
||||
|
||||
/* allocate space for all the cellids in a contiguous array */
|
||||
cellid = (orte_cellid_t*)malloc(num * sizeof(orte_cellid_t));
|
||||
if (NULL == cellid) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
*num_vals = 0;
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* now unpack them in one shot */
|
||||
if (ORTE_SUCCESS != (rc =
|
||||
orte_ns_base_unpack_cellid(buffer, cellid, num_vals, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
*num_vals = 0;
|
||||
free(cellid);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* allocate space for all the jobids in a contiguous array */
|
||||
jobid = (orte_jobid_t*)malloc(num * sizeof(orte_jobid_t));
|
||||
if (NULL == jobid) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
*num_vals = 0;
|
||||
free(cellid);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* now unpack them in one shot */
|
||||
@ -55,6 +73,7 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
*num_vals = 0;
|
||||
free(jobid);
|
||||
free(cellid);
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -64,6 +83,7 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
*num_vals = 0;
|
||||
free(jobid);
|
||||
free(cellid);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* now unpack them in one shot */
|
||||
@ -73,12 +93,14 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
|
||||
*num_vals = 0;
|
||||
free(vpid);
|
||||
free(jobid);
|
||||
free(cellid);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* build the names from the jobid/vpid arrays */
|
||||
/* build the names from the cellid/jobid/vpid arrays */
|
||||
proc = (orte_process_name_t*)dest;
|
||||
for (i=0; i < num; i++) {
|
||||
proc->cellid = cellid[i];
|
||||
proc->jobid = jobid[i];
|
||||
proc->vpid = vpid[i];
|
||||
proc++;
|
||||
@ -87,10 +109,27 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
|
||||
/* cleanup */
|
||||
free(vpid);
|
||||
free(jobid);
|
||||
free(cellid);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* CELLID
|
||||
*/
|
||||
int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Turn around and unpack the real type */
|
||||
if (ORTE_SUCCESS != (ret = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_CELLID_T))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* NODEID
|
||||
*/
|
||||
|
@ -44,7 +44,25 @@
|
||||
* "not available" functions
|
||||
*/
|
||||
int
|
||||
orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodename)
|
||||
orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, char *site, char *resource)
{
    /* "not available" stub: hand back an invalid cellid, log the
     * condition and report ORTE_ERR_UNREACH; site/resource are ignored */
    const int rc = ORTE_ERR_UNREACH;

    *cellid = ORTE_CELLID_INVALID;
    ORTE_ERROR_LOG(rc);

    return rc;
}
|
||||
|
||||
int
|
||||
orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
|
||||
char **site, char **resource)
|
||||
{
|
||||
*site = NULL;
|
||||
*resource = NULL;
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
|
||||
int
|
||||
orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, orte_cellid_t cellid, char **nodename)
|
||||
{
|
||||
*nodeids = NULL;
|
||||
*nnodes = 0;
|
||||
@ -53,7 +71,8 @@ orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr
|
||||
}
|
||||
|
||||
int
|
||||
orte_ns_base_get_node_info_not_available(char ***nodenames, orte_std_cntr_t num_nodeids, orte_nodeid_t *nodeids)
|
||||
orte_ns_base_get_node_info_not_available(char ***nodenames, orte_cellid_t cellid,
|
||||
orte_std_cntr_t num_nodeids, orte_nodeid_t *nodeids)
|
||||
{
|
||||
*nodenames = NULL;
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
|
||||
@ -61,6 +80,95 @@ orte_ns_base_get_node_info_not_available(char ***nodenames, orte_std_cntr_t num_
|
||||
}
|
||||
|
||||
|
||||
/**** CELL STRING FUNCTIONS ****/
|
||||
int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name)
|
||||
{
|
||||
if (NULL == name) { /* got an error */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
*cellid_string = NULL;
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* check for wildcard value - handle appropriately */
|
||||
if (ORTE_CELLID_WILDCARD == name->cellid) {
|
||||
*cellid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* check for invalid value - handle appropriately */
|
||||
if (ORTE_CELLID_INVALID == name->cellid) {
|
||||
*cellid_string = strdup(ORTE_SCHEMA_INVALID_STRING);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (0 > asprintf(cellid_string, "%ld", (long) name->cellid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid)
|
||||
{
|
||||
/* check for wildcard value - handle appropriately */
|
||||
if (ORTE_CELLID_WILDCARD == cellid) {
|
||||
*cellid_string = strdup(ORTE_SCHEMA_WILDCARD_STRING);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* check for invalid value - handle appropriately */
|
||||
if (ORTE_CELLID_INVALID == cellid) {
|
||||
*cellid_string = strdup(ORTE_SCHEMA_INVALID_STRING);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (0 > asprintf(cellid_string, "%ld", (long) cellid)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Parse a cellid from its string form.
 *
 * The schema wildcard and invalid strings map to ORTE_CELLID_WILDCARD and
 * ORTE_CELLID_INVALID respectively.  Anything else is parsed as a base-10
 * integer and must lie within [ORTE_CELLID_MIN, ORTE_CELLID_MAX].
 *
 * Returns ORTE_SUCCESS on success.  Returns ORTE_ERR_BAD_PARAM (with
 * *cellid set to ORTE_CELLID_INVALID) when the string is NULL, contains
 * no leading number at all, or is out of range.
 */
int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring)
{
    long int tmpint;
    char *endptr = NULL;

    if (NULL == cellidstring) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    /** check for wildcard string - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, cellidstring)) {
        *cellid = ORTE_CELLID_WILDCARD;
        return ORTE_SUCCESS;
    }

    /** check for invalid string - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, cellidstring)) {
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_SUCCESS;
    }

    tmpint = strtol(cellidstring, &endptr, 10);

    /* strtol returns 0 and leaves endptr at the start when no digits were
     * consumed; without this check "" or "abc" would be accepted as cell 0 */
    if (endptr == cellidstring) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    if (ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
        *cellid = (orte_cellid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
/**** NODEID STRING FUNCTIONS ****/
|
||||
int orte_ns_base_convert_nodeid_to_string(char **string, const orte_nodeid_t nodeid)
|
||||
{
|
@ -45,6 +45,13 @@
|
||||
* "not available" functions
|
||||
*/
|
||||
|
||||
int
|
||||
orte_ns_base_dump_cells_not_available(void)
|
||||
{
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
|
||||
int
|
||||
orte_ns_base_dump_jobs_not_available(void)
|
||||
{
|
||||
|
@ -44,9 +44,9 @@
|
||||
* globals
|
||||
*/
|
||||
|
||||
orte_process_name_t orte_ns_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
|
||||
orte_process_name_t orte_ns_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
||||
orte_process_name_t orte_ns_name_my_hnp = {0, 0};
|
||||
orte_process_name_t orte_ns_name_wildcard = {ORTE_CELLID_WILDCARD, ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
|
||||
orte_process_name_t orte_ns_name_invalid = {ORTE_CELLID_INVALID, ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
||||
orte_process_name_t orte_ns_name_my_hnp = {0, 0, 0};
|
||||
|
||||
/*
|
||||
* Global variables
|
||||
@ -55,6 +55,12 @@ int mca_ns_base_output = -1;
|
||||
mca_ns_base_module_t orte_ns = {
|
||||
/* init */
|
||||
orte_ns_base_module_init_not_available,
|
||||
/* cell functions */
|
||||
orte_ns_base_create_cellid_not_available,
|
||||
orte_ns_base_get_cell_info_not_available,
|
||||
orte_ns_base_get_cellid_string,
|
||||
orte_ns_base_convert_cellid_to_string,
|
||||
orte_ns_base_convert_string_to_cellid,
|
||||
/* node functions */
|
||||
orte_ns_base_create_nodeids_not_available,
|
||||
orte_ns_base_get_node_info_not_available,
|
||||
@ -89,6 +95,7 @@ mca_ns_base_module_t orte_ns = {
|
||||
/* data type functions */
|
||||
orte_ns_base_define_data_type_not_available,
|
||||
/* diagnostic functions */
|
||||
orte_ns_base_dump_cells_not_available,
|
||||
orte_ns_base_dump_jobs_not_available,
|
||||
orte_ns_base_dump_tags_not_available,
|
||||
orte_ns_base_dump_datatypes_not_available,
|
||||
@ -150,12 +157,6 @@ int orte_ns_base_open(void)
|
||||
}
|
||||
mca_ns_base_output = opal_output_open(&kill_prefix);
|
||||
|
||||
/* setup the print_args function */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_init_print_args())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* register the base system types with the DPS */
|
||||
tmp = ORTE_NAME;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_name,
|
||||
@ -199,7 +200,21 @@ int orte_ns_base_open(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Open up all available components */
|
||||
tmp = ORTE_CELLID;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_cellid,
|
||||
orte_ns_base_unpack_cellid,
|
||||
(orte_dss_copy_fn_t)orte_ns_base_copy_cellid,
|
||||
(orte_dss_compare_fn_t)orte_ns_base_compare_cellid,
|
||||
(orte_dss_size_fn_t)orte_ns_base_std_size,
|
||||
(orte_dss_print_fn_t)orte_ns_base_std_print,
|
||||
(orte_dss_release_fn_t)orte_ns_base_std_release,
|
||||
ORTE_DSS_UNSTRUCTURED,
|
||||
"ORTE_CELLID", &tmp))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Open up all available components */
|
||||
|
||||
if (ORTE_SUCCESS !=
|
||||
mca_base_components_open("ns", mca_ns_base_output,
|
||||
|
@ -1,82 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/threads/tsd.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
|
||||
#define ORTE_PRINT_NAME_ARGS_MAX_SIZE 20
|
||||
|
||||
static opal_tsd_key_t print_args_tsd_key;
|
||||
char* orte_print_args_null = "NULL";
|
||||
|
||||
/*
 * Release a buffer previously handed out by this file; registered as the
 * cleanup callback passed to opal_tsd_key_create().
 */
static void
buffer_cleanup(void *value)
{
    /* free() accepts NULL and does nothing, so no guard is required */
    free(value);
}
|
||||
|
||||
static char*
|
||||
get_print_name_buffer(void)
|
||||
{
|
||||
void *buffer;
|
||||
int ret;
|
||||
|
||||
ret = opal_tsd_getspecific(print_args_tsd_key, &buffer);
|
||||
if (OPAL_SUCCESS != ret) return NULL;
|
||||
|
||||
if (NULL == buffer) {
|
||||
buffer = (void*) malloc((ORTE_PRINT_NAME_ARGS_MAX_SIZE+1) * sizeof(char));
|
||||
ret = opal_tsd_setspecific(print_args_tsd_key, buffer);
|
||||
}
|
||||
|
||||
return (char*) buffer;
|
||||
}
|
||||
|
||||
/*
 * Format a process name as "[jobid,vpid]" (or "[NO-NAME]" for a NULL
 * name) into the buffer obtained from get_print_name_buffer() and return
 * that buffer.  Output is limited to ORTE_PRINT_NAME_ARGS_MAX_SIZE bytes
 * by snprintf.  If no buffer can be obtained, the error is logged and the
 * shared orte_print_args_null string is returned instead.
 */
char* orte_ns_base_print_name_args(orte_process_name_t *name)
{
    char *out = get_print_name_buffer();

    if (NULL == out) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return orte_print_args_null;
    }

    if (NULL != name) {
        snprintf(out, ORTE_PRINT_NAME_ARGS_MAX_SIZE, "[%ld,%ld]", (long)name->jobid, (long)name->vpid);
    } else {
        snprintf(out, ORTE_PRINT_NAME_ARGS_MAX_SIZE, "[NO-NAME]");
    }

    return out;
}
|
||||
|
||||
int
|
||||
orte_ns_base_init_print_args(void)
|
||||
{
|
||||
return opal_tsd_key_create(&print_args_tsd_key, buffer_cleanup);
|
||||
}
|
@ -55,7 +55,7 @@ orte_ns_base_create_my_name_not_available(void)
|
||||
int orte_ns_base_get_proc_name_string(char **name_string,
|
||||
const orte_process_name_t* name)
|
||||
{
|
||||
char *tmp;
|
||||
char *tmp, *tmp2;
|
||||
|
||||
if (NULL == name) { /* got an error */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
@ -66,23 +66,32 @@ int orte_ns_base_get_proc_name_string(char **name_string,
|
||||
* corresponding string so we can correctly parse the name string when
|
||||
* it is passed back to us later
|
||||
*/
|
||||
if (ORTE_JOBID_WILDCARD == name->jobid) {
|
||||
asprintf(&tmp, "%s", ORTE_SCHEMA_WILDCARD_STRING);
|
||||
} else if (ORTE_JOBID_INVALID == name->jobid) {
|
||||
asprintf(&tmp, "%s", ORTE_SCHEMA_INVALID_STRING);
|
||||
if (ORTE_CELLID_WILDCARD == name->cellid) {
|
||||
tmp = strdup(ORTE_SCHEMA_WILDCARD_STRING);
|
||||
} else if (ORTE_CELLID_INVALID == name->cellid) {
|
||||
tmp = strdup(ORTE_SCHEMA_INVALID_STRING);
|
||||
} else {
|
||||
asprintf(&tmp, "%ld", (long)name->jobid);
|
||||
asprintf(&tmp, "%ld", (long)name->cellid);
|
||||
}
|
||||
|
||||
if (ORTE_VPID_WILDCARD == name->vpid) {
|
||||
asprintf(name_string, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
|
||||
} else if (ORTE_VPID_INVALID == name->vpid) {
|
||||
asprintf(name_string, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
|
||||
if (ORTE_JOBID_WILDCARD == name->jobid) {
|
||||
asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
|
||||
} else if (ORTE_JOBID_INVALID == name->jobid) {
|
||||
asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
|
||||
} else {
|
||||
asprintf(name_string, "%s%c%ld", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (long)name->vpid);
|
||||
asprintf(&tmp2, "%s%c%ld", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (long)name->jobid);
|
||||
}
|
||||
free(tmp);
|
||||
|
||||
if (ORTE_VPID_WILDCARD == name->vpid) {
|
||||
asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
|
||||
} else if (ORTE_VPID_INVALID == name->vpid) {
|
||||
asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
|
||||
} else {
|
||||
asprintf(name_string, "%s%c%ld", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, (long)name->vpid);
|
||||
}
|
||||
free(tmp2);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -90,6 +99,7 @@ int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
|
||||
const char* name_string)
|
||||
{
|
||||
char *temp, *token;
|
||||
orte_cellid_t cell;
|
||||
orte_jobid_t job;
|
||||
orte_vpid_t vpid;
|
||||
long int tmpint;
|
||||
@ -102,16 +112,45 @@ int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
|
||||
}
|
||||
|
||||
temp = strdup(name_string); /** copy input string as the strtok process is destructive */
|
||||
token = strtok(temp, ORTE_SCHEMA_DELIMITER_STRING); /** get first field -> jobid */
|
||||
token = strtok(temp, ORTE_SCHEMA_DELIMITER_STRING); /** get first field -> cellid */
|
||||
|
||||
/* check for error */
|
||||
if (NULL == token) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/* convert to largest possible int - then
|
||||
* check to ensure it is within range of cellid_t before casting
|
||||
*/
|
||||
|
||||
/* first, though, check for WILDCARD character - assign
|
||||
* value accordingly, if found
|
||||
*/
|
||||
if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) {
|
||||
cell = ORTE_CELLID_WILDCARD;
|
||||
} else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) {
|
||||
cell = ORTE_CELLID_INVALID;
|
||||
} else {
|
||||
tmpint = strtol(token, NULL, 10);
|
||||
if (ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
|
||||
cell = (orte_cellid_t)tmpint;
|
||||
} else {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return_code = ORTE_ERR_BAD_PARAM;
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
|
||||
token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> jobid */
|
||||
|
||||
/** convert to largest possible int - then
|
||||
* check to ensure it is within range of jobid_t before casting */
|
||||
|
||||
/* check for error */
|
||||
if (NULL == token) {
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
/** first, though, check for WILDCARD character - assign
|
||||
* value accordingly, if found
|
||||
*/
|
||||
@ -159,7 +198,7 @@ int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (return_code =
|
||||
orte_ns_base_create_process_name(name, job, vpid))) {
|
||||
orte_ns_base_create_process_name(name, cell, job, vpid))) {
|
||||
ORTE_ERROR_LOG(return_code);
|
||||
}
|
||||
|
||||
@ -171,6 +210,7 @@ CLEANUP:
|
||||
|
||||
/**** CREATE PROCESS NAME ****/
|
||||
int orte_ns_base_create_process_name(orte_process_name_t **name,
|
||||
orte_cellid_t cell,
|
||||
orte_jobid_t job,
|
||||
orte_vpid_t vpid)
|
||||
{
|
||||
@ -182,6 +222,7 @@ int orte_ns_base_create_process_name(orte_process_name_t **name,
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
(*name)->cellid = cell;
|
||||
(*name)->jobid = job;
|
||||
(*name)->vpid = vpid;
|
||||
return ORTE_SUCCESS;
|
||||
@ -296,8 +337,16 @@ int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
|
||||
* function does not actually stand for a wildcard value, but
|
||||
* rather a specific value
|
||||
*/
|
||||
|
||||
/* check job id */
|
||||
if (ORTE_NS_CMP_CELLID & fields) { /* check cellid field */
|
||||
if (name1->cellid < name2->cellid) {
|
||||
return ORTE_VALUE2_GREATER;
|
||||
} else if (name1->cellid > name2->cellid) {
|
||||
return ORTE_VALUE1_GREATER;
|
||||
}
|
||||
}
|
||||
|
||||
/* get here if cellid's are equal, or cellid not being checked */
|
||||
/* now check job id */
|
||||
|
||||
if (ORTE_NS_CMP_JOBID & fields) {
|
||||
if (name1->jobid < name2->jobid) {
|
||||
@ -307,7 +356,8 @@ int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
|
||||
}
|
||||
}
|
||||
|
||||
/* get here if jobid's are equal, or not being checked
|
||||
/* get here if cellid's and jobid's are equal, or neither being checked,
|
||||
* or cellid not checked and jobid's equal.
|
||||
* now check vpid
|
||||
*/
|
||||
|
||||
@ -320,7 +370,8 @@ int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
|
||||
}
|
||||
|
||||
/* only way to get here is if all fields are being checked and are equal,
|
||||
* or jobid not checked, but vpid equal,
|
||||
* or cellid not checked, but jobid and vpid equal,
|
||||
* or cellid and jobid not checked, but vpid equal,
|
||||
* only vpid being checked, and equal
|
||||
* return that fact
|
||||
*/
|
||||
|
@ -57,6 +57,7 @@ typedef uint8_t orte_ns_cmd_flag_t;
|
||||
* typedefs above and in ns_types.h
|
||||
*/
|
||||
#define ORTE_NS_CMD ORTE_INT8
|
||||
#define ORTE_CELLID_T ORTE_INT32
|
||||
#define ORTE_NODEID_T ORTE_INT32
|
||||
#define ORTE_JOBID_T ORTE_INT32
|
||||
#define ORTE_VPID_T ORTE_INT32
|
||||
@ -64,6 +65,8 @@ typedef uint8_t orte_ns_cmd_flag_t;
|
||||
/*
|
||||
* define flag values for remote commands - only used internally
|
||||
*/
|
||||
#define ORTE_NS_CREATE_CELLID_CMD (int8_t) 1
|
||||
#define ORTE_NS_GET_CELL_INFO_CMD (int8_t) 2
|
||||
#define ORTE_NS_CREATE_NODEID_CMD (int8_t) 3
|
||||
#define ORTE_NS_GET_NODE_INFO_CMD (int8_t) 4
|
||||
#define ORTE_NS_CREATE_JOBID_CMD (int8_t) 5
|
||||
@ -76,6 +79,7 @@ typedef uint8_t orte_ns_cmd_flag_t;
|
||||
#define ORTE_NS_GET_PEERS_CMD (int8_t) 12
|
||||
#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t) 13
|
||||
#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t) 14
|
||||
#define ORTE_NS_DUMP_CELLS_CMD (int8_t) 15
|
||||
#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t) 16
|
||||
#define ORTE_NS_DUMP_TAGS_CMD (int8_t) 17
|
||||
#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t) 18
|
||||
@ -88,6 +92,7 @@ typedef uint8_t orte_ns_cmd_flag_t;
|
||||
*/
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_create_process_name(orte_process_name_t **name,
|
||||
orte_cellid_t cell,
|
||||
orte_jobid_t job,
|
||||
orte_vpid_t vpid);
|
||||
|
||||
@ -109,11 +114,19 @@ ORTE_DECLSPEC int orte_ns_base_convert_jobid_to_string(char **jobid_string, c
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *nodeid, const char *string);
|
||||
ORTE_DECLSPEC int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name);
|
||||
|
||||
/* Convert a string into a nodeid, stored through the nodeid pointer. */
ORTE_DECLSPEC int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *nodeid, const char *string);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_convert_nodeid_to_string(char **nodeid_string, const orte_nodeid_t nodeid);
|
||||
|
||||
@ -127,10 +140,16 @@ ORTE_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer);
|
||||
/* not available functions */
|
||||
ORTE_DECLSPEC int orte_ns_base_module_init_not_available(void);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
|
||||
char **nodename);
|
||||
ORTE_DECLSPEC int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid,
|
||||
char *site, char *resource);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_get_node_info_not_available(char ***nodename,
|
||||
ORTE_DECLSPEC int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
|
||||
char **site, char **resource);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
|
||||
orte_cellid_t cellid, char **nodename);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_get_node_info_not_available(char ***nodename, orte_cellid_t cellid,
|
||||
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid, opal_list_t *attrs);
|
||||
@ -167,6 +186,7 @@ ORTE_DECLSPEC int orte_ns_base_create_my_name_not_available(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_get_peers_not_available(orte_process_name_t **procs,
|
||||
orte_std_cntr_t *num_procs, opal_list_t *attributes);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_dump_cells_not_available(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_dump_jobs_not_available(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_dump_tags_not_available(void);
|
||||
ORTE_DECLSPEC int orte_ns_base_dump_datatypes_not_available(void);
|
||||
@ -178,6 +198,9 @@ ORTE_DECLSPEC int orte_ns_base_ft_event_not_available(int state);
|
||||
ORTE_DECLSPEC int orte_ns_base_pack_name(orte_buffer_t *buffer, const void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_pack_cellid(orte_buffer_t *buffer, const void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_pack_nodeid(orte_buffer_t *buffer, const void *src,
|
||||
orte_std_cntr_t num_vals, orte_data_type_t type);
|
||||
|
||||
@ -190,6 +213,9 @@ ORTE_DECLSPEC int orte_ns_base_pack_vpid(orte_buffer_t *buffer, const void *s
|
||||
ORTE_DECLSPEC int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
ORTE_DECLSPEC int orte_ns_base_unpack_nodeid(orte_buffer_t *buffer, void *dest,
|
||||
orte_std_cntr_t *num_vals, orte_data_type_t type);
|
||||
|
||||
@ -207,6 +233,8 @@ int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src,
|
||||
|
||||
int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_ns_base_copy_nodeid(orte_nodeid_t **dest, orte_nodeid_t *src, orte_data_type_t type);
|
||||
|
||||
int orte_ns_base_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, orte_data_type_t type);
|
||||
@ -228,6 +256,10 @@ int orte_ns_base_compare_jobid(orte_jobid_t *value1,
|
||||
orte_jobid_t *value2,
|
||||
orte_data_type_t type);
|
||||
|
||||
int orte_ns_base_compare_cellid(orte_cellid_t *value1,
|
||||
orte_cellid_t *value2,
|
||||
orte_data_type_t type);
|
||||
|
||||
int orte_ns_base_compare_nodeid(orte_nodeid_t *value1,
|
||||
orte_nodeid_t *value2,
|
||||
orte_data_type_t type);
|
||||
|
148
orte/mca/ns/ns.h
148
orte/mca/ns/ns.h
@ -59,25 +59,116 @@ extern "C" {
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_init_fn_t)(void);
|
||||
|
||||
/**** CELL FUNCTIONS ****/
|
||||
/**
|
||||
* Create a new cell id.
|
||||
* Allocates a new cell id for use by the caller. The function returns an
|
||||
* existing cellid if the specified site/resource already has been assigned
|
||||
* one.
|
||||
*
|
||||
* @param site The name of the site where the cell is located.
|
||||
* @param resource The name of the resource associated with this cell (e.g., the name
|
||||
* of the cluster).
|
||||
* @param cellid The location where the cellid is to be stored.
|
||||
*
|
||||
* @retval ORTE_SUCCESS A cellid was created and returned.
|
||||
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
|
||||
*
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_create_cellid_fn_t)(orte_cellid_t *cellid,
|
||||
char *site, char *resource);
|
||||
|
||||
/**
|
||||
* Get cell info
|
||||
* Retrieve the site and resource info on a cell.
|
||||
*
|
||||
* @param cellid The id of the cell whose info is being requested.
|
||||
* @param site Returns a pointer to a strdup'd string containing the site name.
|
||||
* @param resource Returns a pointer to a strdup'd string containing the resource name.
|
||||
* @retval ORTE_SUCCESS The site and resource info for the cell was returned.
|
||||
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid,
|
||||
char **site, char **resource);
|
||||
|
||||
/**
|
||||
* Get the cell id as a character string.
|
||||
* The get_cellid_string() function returns the cell id in a character string
|
||||
* representation. The string is created by expressing the field in hexadecimal. Memory
|
||||
* for the string is allocated by the function - releasing that allocation is the
|
||||
* responsibility of the calling program.
|
||||
*
|
||||
* @param *name A pointer to the name structure containing the name to be
|
||||
* "translated" to a string.
|
||||
*
|
||||
* @retval *name_string A pointer to the character string representation of the
|
||||
* cell id.
|
||||
* @retval NULL Indicates an error occurred - either no memory could be allocated
|
||||
* or the caller provided an incorrect name pointer (e.g., NULL).
|
||||
*
|
||||
* @code
|
||||
* cellid-string = ompi_name_server.get_cellid_string(&name)
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name);
|
||||
|
||||
/**
|
||||
* Convert cellid to character string
|
||||
* Returns the cellid in a character string representation. The string is created
|
||||
* by expressing the provided cellid in hexadecimal. Memory for the string is
|
||||
* allocated by the function - releasing that allocation is the responsibility of
|
||||
* the calling program.
|
||||
*
|
||||
* @param cellid The cellid to be converted.
|
||||
*
|
||||
* @retval *cellid_string A pointer to a character string representation of the cellid.
|
||||
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
|
||||
*
|
||||
* @code
|
||||
* cellid-string = ompi_name_server.convert_cellid_to_string(cellid);
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid);
|
||||
|
||||
/**
|
||||
* Convert a string to a cellid.
|
||||
* Converts a character string into a cellid. The character string must be a
|
||||
* hexadecimal representation of a valid cellid.
|
||||
*
|
||||
* @param cellidstring The string to be converted.
|
||||
*
|
||||
* @retval cellid The resulting cellid
|
||||
* @retval MCA_NS_BASE_CELLID_MAX String could not be converted.
|
||||
*
|
||||
* @code
|
||||
* cellid = ompi_name_server.convert_string_to_cellid(cellidstring);
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
|
||||
|
||||
|
||||
/**** NODE FUNCTIONS ****/
|
||||
/*
|
||||
* Get an array of node id's
|
||||
* Given a NULL-terminated array of names of nodes within it, this function assigns an id to represent
|
||||
* each node.
|
||||
* Given the cell and a NULL-terminated array of names of nodes within it, this function assigns an id to represent
|
||||
* each node within the cell.
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_create_nodeids_fn_t)(orte_nodeid_t **nodes, orte_std_cntr_t *nnodes,
|
||||
char **nodenames);
|
||||
orte_cellid_t cellid, char **nodename);
|
||||
|
||||
/*
|
||||
* Get node info
|
||||
* Retrieve the names of an array of nodes given their nodeids.
|
||||
* Retrieve the names of an array of nodes given their cellid and nodeids. The cellid
|
||||
* is required as the nodeids are only unique within a given cell.
|
||||
*
|
||||
* @param cellid The id of the cell of the node.
|
||||
* @param nodeids The ids of the node.
|
||||
* @param nodenames Returns a pointer to a NULL-terminated array of strdup'd strings containing the node names.
|
||||
* @retval ORTE_SUCCESS The nodename was created and returned.
|
||||
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_get_node_info_fn_t)(char ***nodename,
|
||||
typedef int (*orte_ns_base_module_get_node_info_fn_t)(char ***nodename, orte_cellid_t cellid,
|
||||
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
|
||||
|
||||
/*
|
||||
@ -169,6 +260,8 @@ typedef int (*orte_ns_base_module_get_parent_job_fn_t)(orte_jobid_t *parent, ort
|
||||
/**
|
||||
* Reserve a range of process id's.
|
||||
* The reserve_range() function reserves a range of vpid's for the given jobid.
|
||||
* Note that the cellid does not factor into this request - jobid's span the entire universe,
|
||||
* hence the cell where the process is currently executing is irrelevant to this request.
|
||||
*
|
||||
* @param jobid The id of the job for which the vpid's are to be reserved.
|
||||
* @param range The number of vpid's to be reserved. The function will find the
|
||||
@ -257,7 +350,13 @@ typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jo
|
||||
* The create_process_name() function creates a single process name structure and fills the
|
||||
* fields with the provided values.
|
||||
*
|
||||
* @param job The id of the job to which the process will belong.
|
||||
* @param cell The cell for which the process name is intended. Usually, this is
|
||||
* the id of the cell where the process is initially planning to be spawned.
|
||||
* @param job The id of the job to which the process will belong. Process id's are
|
||||
* tracked according to jobid, but not cellid. Thus, two processes
|
||||
* can have the same process id if and only if they have different jobid's. However,
|
||||
* two processes in the same jobid cannot have the same process id, regardless
|
||||
* of whether or not they are in the same cell.
|
||||
* @param vpid The virtual process id for the name. Note that no check is made for uniqueness -
|
||||
* the caller is responsible for ensuring that the requested name is, in fact, unique
|
||||
* by first requesting reservation of an appropriate range of virtual process id's.
|
||||
@ -271,6 +370,7 @@ typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jo
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name,
|
||||
orte_cellid_t cell,
|
||||
orte_jobid_t job,
|
||||
orte_vpid_t vpid);
|
||||
|
||||
@ -289,14 +389,17 @@ typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
|
||||
* Convert a string representation to a process name.
|
||||
* The convert_string_to_process_name() function converts a string representation of a process
|
||||
* name into an Open MPI name structure. The string must be of the proper form - i.e., it
|
||||
* must be in the form "jobid.vpid", where each field is expressed in hexadecimal form.
|
||||
* must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form.
|
||||
*
|
||||
* @param *name_string A character string representation of a process name.
|
||||
*
|
||||
* @retval *name Pointer to an orte_process_name_t structure containing the name.
|
||||
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
|
||||
* @retval NULL Indicates an error, probably due to inability to allocate memory for
|
||||
* the name structure.
|
||||
*
|
||||
* @code
|
||||
* name = ompi_name_server.convert_string_to_process_name(name_string);
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name,
|
||||
const char* name_string);
|
||||
@ -305,7 +408,10 @@ typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_proc
|
||||
/**
|
||||
* Get the process name as a character string.
|
||||
* The get_proc_name_string() function returns the entire process name in a
|
||||
* character string representation.
|
||||
* character string representation. The string is created by expressing each
|
||||
* field in hexadecimal separated by periods, as follows:
|
||||
*
|
||||
* sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid)
|
||||
*
|
||||
* The memory required for the string is allocated by the function - releasing
|
||||
* that allocation is the responsibility of the calling program.
|
||||
@ -330,13 +436,13 @@ typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
|
||||
* The compare() function checks the value of the fields in the two
|
||||
* provided names, and returns a value indicating if the first one is less than, greater
|
||||
* than, or equal to the second. The value of each field is compared in a hierarchical
|
||||
* fashion, with jobid and vpid in sequence. The bit-mask
|
||||
* fashion, with cellid first, followed by jobid and vpid in sequence. The bit-mask
|
||||
* indicates which fields are to be included in the comparison. Fields not included via the
|
||||
* bit-mask are ignored. Thus, the caller may request that any combination of the two fields
|
||||
* bit-mask are ignored. Thus, the caller may request that any combination of the three fields
|
||||
* be included in the comparison.
|
||||
*
|
||||
* @param fields A bit-mask indicating which fields are to be included in the comparison. The
|
||||
* comparison is performed on a hierarchical basis, with
|
||||
* comparison is performed on a hierarchical basis, with cellid being first, followed by
|
||||
* jobid and then vpid. Each field can be included separately, thus allowing the caller
|
||||
* to configure the comparison to meet their needs.
|
||||
* @param *name1 A pointer to the first name structure.
|
||||
@ -348,6 +454,11 @@ typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
|
||||
* @retval +1 The indicated fields of the first provided name are greater than the same
|
||||
* fields of the second provided name.
|
||||
*
|
||||
* The function returns a large negative value if there is an error.
|
||||
*
|
||||
* @code
|
||||
* result = ompi_name_server.compare(bit_mask, &name1, &name2)
|
||||
* @endcode
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_compare_fields_fn_t)(orte_ns_cmp_bitmask_t fields,
|
||||
const orte_process_name_t* name1,
|
||||
@ -451,9 +562,9 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
|
||||
* request that all peers for the parent job be returned, for example.
|
||||
* More common options would be to specify a cell or job.
|
||||
*
|
||||
* NOTE ORTE_JOBID_WILDCARD
|
||||
* NOTE The combination of ORTE_CELLID_WILDCARD and ORTE_JOBID_WILDCARD
|
||||
* in the attribute list will cause the function to return the names of *all*
|
||||
* processes currently active.
|
||||
* processes currently active in the universe.
|
||||
*
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
|
||||
@ -464,6 +575,8 @@ typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
|
||||
/*
|
||||
* DIAGNOSTIC INTERFACES
|
||||
*/
|
||||
typedef int (*orte_ns_base_module_dump_cells_fn_t)(void);
|
||||
|
||||
typedef int (*orte_ns_base_module_dump_jobs_fn_t)(void);
|
||||
|
||||
typedef int (*orte_ns_base_module_dump_tags_fn_t)(void);
|
||||
@ -478,6 +591,12 @@ typedef int (*orte_ns_base_module_ft_event_fn_t)(int state);
|
||||
struct mca_ns_base_module_2_0_0_t {
|
||||
/* init */
|
||||
orte_ns_base_module_init_fn_t init;
|
||||
/* cell functions */
|
||||
orte_ns_base_module_create_cellid_fn_t create_cellid;
|
||||
orte_ns_base_module_get_cell_info_fn_t get_cell_info;
|
||||
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
|
||||
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
|
||||
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
|
||||
/** node functions */
|
||||
orte_ns_base_module_create_nodeids_fn_t create_nodeids;
|
||||
orte_ns_base_module_get_node_info_fn_t get_node_info;
|
||||
@ -512,6 +631,7 @@ struct mca_ns_base_module_2_0_0_t {
|
||||
/* data type functions */
|
||||
orte_ns_base_module_define_data_type_fn_t define_data_type;
|
||||
/* diagnostic functions */
|
||||
orte_ns_base_module_dump_cells_fn_t dump_cells;
|
||||
orte_ns_base_module_dump_jobs_fn_t dump_jobs;
|
||||
orte_ns_base_module_dump_tags_fn_t dump_tags;
|
||||
orte_ns_base_module_dump_datatypes_fn_t dump_datatypes;
|
||||
|
@ -50,6 +50,7 @@ extern "C" {
|
||||
/**** NS ATTRIBUTES ****/
|
||||
#define ORTE_NS_USE_PARENT "orte-ns-use-parent"
|
||||
#define ORTE_NS_USE_ROOT "orte-ns-use-root"
|
||||
#define ORTE_NS_USE_CELL "orte-ns-use-cell"
|
||||
#define ORTE_NS_USE_JOBID "orte-ns-use-job"
|
||||
#define ORTE_NS_USE_NODE "orte-ns-use-node"
|
||||
#define ORTE_NS_INCLUDE_DESCENDANTS "orte-ns-include-desc"
|
||||
@ -58,6 +59,7 @@ extern "C" {
|
||||
|
||||
|
||||
#define ORTE_NAME_ARGS(n) \
|
||||
(long) ((NULL == n) ? (long)-1 : (long)(n)->cellid), \
|
||||
(long) ((NULL == n) ? (long)-1 : (long)(n)->jobid), \
|
||||
(long) ((NULL == n) ? (long)-1 : (long)(n)->vpid)
|
||||
|
||||
@ -67,6 +69,7 @@ extern "C" {
|
||||
*/
|
||||
|
||||
#define ORTE_NS_CMP_NONE 0x00
|
||||
#define ORTE_NS_CMP_CELLID 0x01
|
||||
#define ORTE_NS_CMP_JOBID 0x02
|
||||
#define ORTE_NS_CMP_VPID 0x04
|
||||
#define ORTE_NS_CMP_ALL 0Xff
|
||||
@ -83,26 +86,23 @@ extern "C" {
|
||||
* ns_private.h
|
||||
*/
|
||||
typedef orte_std_cntr_t orte_jobid_t;
|
||||
typedef orte_std_cntr_t orte_cellid_t;
|
||||
typedef orte_std_cntr_t orte_nodeid_t;
|
||||
typedef orte_std_cntr_t orte_vpid_t;
|
||||
|
||||
typedef uint8_t orte_ns_cmp_bitmask_t; /**< Bit mask for comparing process names */
|
||||
|
||||
struct orte_process_name_t {
|
||||
orte_cellid_t cellid; /**< Cell number */
|
||||
orte_jobid_t jobid; /**< Job number */
|
||||
orte_vpid_t vpid; /**< Process number */
|
||||
};
|
||||
typedef struct orte_process_name_t orte_process_name_t;
|
||||
|
||||
|
||||
/* useful define to print name args in output messages */
|
||||
ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *name);
|
||||
#define ORTE_NAME_PRINT(n) \
|
||||
orte_ns_base_print_name_args(n)
|
||||
|
||||
/*
|
||||
* define maximum value for id's in any field
|
||||
*/
|
||||
#define ORTE_CELLID_MAX ORTE_STD_CNTR_MAX
|
||||
#define ORTE_JOBID_MAX ORTE_STD_CNTR_MAX
|
||||
#define ORTE_VPID_MAX ORTE_STD_CNTR_MAX
|
||||
#define ORTE_NODEID_MAX ORTE_STD_CNTR_MAX
|
||||
@ -110,6 +110,7 @@ ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *nam
|
||||
/*
|
||||
* define minimum value for id's in any field
|
||||
*/
|
||||
#define ORTE_CELLID_MIN ORTE_STD_CNTR_MIN
|
||||
#define ORTE_JOBID_MIN ORTE_STD_CNTR_MIN
|
||||
#define ORTE_VPID_MIN ORTE_STD_CNTR_MIN
|
||||
#define ORTE_NODEID_MIN ORTE_STD_CNTR_MIN
|
||||
@ -117,6 +118,7 @@ ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *nam
|
||||
/*
|
||||
* define invalid values
|
||||
*/
|
||||
#define ORTE_CELLID_INVALID (ORTE_CELLID_MIN + 1)
|
||||
#define ORTE_JOBID_INVALID (ORTE_JOBID_MIN + 1)
|
||||
#define ORTE_VPID_INVALID (ORTE_VPID_MIN + 1)
|
||||
#define ORTE_NODEID_INVALID (ORTE_NODEID_MIN + 1)
|
||||
@ -124,6 +126,7 @@ ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *nam
|
||||
/*
|
||||
* define wildcard values (should be -1)
|
||||
*/
|
||||
#define ORTE_CELLID_WILDCARD -1
|
||||
#define ORTE_JOBID_WILDCARD -1
|
||||
#define ORTE_VPID_WILDCARD -1
|
||||
#define ORTE_NODEID_WILDCARD -1
|
||||
@ -149,6 +152,7 @@ ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_my_hnp; /** instantiated
|
||||
* @param name
|
||||
*/
|
||||
#define ORTE_PROCESS_NAME_HTON(n) \
|
||||
n.cellid = htonl(n.cellid); \
|
||||
n.jobid = htonl(n.jobid); \
|
||||
n.vpid = htonl(n.vpid);
|
||||
|
||||
@ -158,6 +162,7 @@ ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_my_hnp; /** instantiated
|
||||
* @param name
|
||||
*/
|
||||
#define ORTE_PROCESS_NAME_NTOH(n) \
|
||||
n.cellid = ntohl(n.cellid); \
|
||||
n.jobid = ntohl(n.jobid); \
|
||||
n.vpid = ntohl(n.vpid);
|
||||
|
||||
|
@ -75,6 +75,8 @@ int orte_ns_proxy_finalize(void);
|
||||
typedef struct {
|
||||
size_t max_size, block_size;
|
||||
int debug;
|
||||
orte_cellid_t num_cells;
|
||||
orte_pointer_array_t *cells;
|
||||
orte_pointer_array_t *tags;
|
||||
orte_rml_tag_t num_tags;
|
||||
orte_pointer_array_t *dts;
|
||||
@ -93,9 +95,14 @@ extern orte_ns_proxy_globals_t orte_ns_proxy;
|
||||
/*
|
||||
* proxy function prototypes
|
||||
*/
|
||||
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames);
|
||||
int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource);
|
||||
|
||||
int orte_ns_proxy_get_node_info(char ***nodename, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
|
||||
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, char **site, char **resource);
|
||||
|
||||
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
|
||||
orte_cellid_t cellid, char **nodenames);
|
||||
|
||||
int orte_ns_proxy_get_node_info(char ***nodename, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
|
||||
|
||||
int orte_ns_proxy_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs);
|
||||
|
||||
@ -127,6 +134,8 @@ int orte_ns_proxy_create_my_name(void);
|
||||
/*
|
||||
* Diagnostic functions
|
||||
*/
|
||||
int orte_ns_proxy_dump_cells(void);
|
||||
|
||||
int orte_ns_proxy_dump_jobs(void);
|
||||
|
||||
int orte_ns_proxy_dump_tags(void);
|
||||
|
@ -45,7 +45,186 @@
|
||||
* functions
|
||||
*/
|
||||
|
||||
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames)
|
||||
/**
 * Ask the name service replica to create a cell id.
 *
 * Builds a request holding the ORTE_NS_CREATE_CELLID_CMD command, the value
 * currently stored in *cellid, the site name and the resource name, sends it
 * to ORTE_NS_MY_REPLICA on ORTE_RML_TAG_NS and blocks for the reply. The
 * reply must echo the same command; the cellid that follows it is stored in
 * *cellid.
 *
 * @param cellid   In: value packed into the request. Out: cellid unpacked
 *                 from the replica's reply (only on success).
 * @param site     Site name packed into the request.
 * @param resource Resource name packed into the request.
 *
 * @retval ORTE_SUCCESS             The reply was received and unpacked.
 * @retval ORTE_ERR_OUT_OF_RESOURCE A buffer could not be allocated.
 * @retval ORTE_ERR_COMM_FAILURE    Send/receive failed, or the reply carried
 *                                  a different command.
 * @retval (other)                  Error code returned by orte_dss pack/unpack.
 */
int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    command = ORTE_NS_CREATE_CELLID_CMD;

    /* assemble the request: command, cellid, site, resource */
    cmd = OBJ_NEW(orte_buffer_t);
    if (cmd == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &site, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &resource, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* wait for the replica's reply */
    answer = OBJ_NEW(orte_buffer_t);
    if (answer == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    if (ORTE_NS_CREATE_CELLID_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, cellid, &count, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }
    OBJ_RELEASE(answer);

    /* This function never acquires orte_ns_proxy.mutex, so it must not
     * release it either: unlocking a mutex this thread does not hold is
     * undefined and could drop a lock owned by another thread.
     */
    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
/**
 * Ask the name service replica for the site and resource names of a cell.
 *
 * Sends ORTE_NS_GET_CELL_INFO_CMD plus the cellid to ORTE_NS_MY_REPLICA on
 * ORTE_RML_TAG_NS and blocks for the reply. The reply must echo the same
 * command; the two strings that follow it are unpacked into *site and
 * *resource. orte_ns_proxy.mutex is held for the whole request/reply
 * exchange and released on every exit path.
 *
 * @param cellid   The id of the cell being queried.
 * @param site     Receives the site string unpacked from the reply.
 * @param resource Receives the resource string unpacked from the reply.
 *                 NOTE(review): the interface documentation describes both
 *                 strings as strdup'd, i.e. owned by the caller - confirm
 *                 against the dss string unpack routine.
 *
 * @retval ORTE_SUCCESS             Both strings were unpacked.
 * @retval ORTE_ERR_OUT_OF_RESOURCE A buffer could not be allocated.
 * @retval ORTE_ERR_COMM_FAILURE    Send/receive failed, or the reply carried
 *                                  a different command.
 * @retval (other)                  Error code returned by orte_dss pack/unpack.
 */
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
                                char **site, char **resource)
{
    orte_buffer_t* cmd = NULL;
    orte_buffer_t* answer = NULL;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    command = ORTE_NS_GET_CELL_INFO_CMD;

    /* Every exit path below releases the mutex, so it has to be taken
     * first; releasing a mutex that was never acquired is undefined.
     */
    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* assemble and send the request: command, cellid */
    cmd = OBJ_NEW(orte_buffer_t);
    if (cmd == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        rc = ORTE_ERR_COMM_FAILURE;
        goto cleanup;
    }
    OBJ_RELEASE(cmd);
    cmd = NULL;   /* already released - keep cleanup from touching it again */

    /* wait for the replica's reply */
    answer = OBJ_NEW(orte_buffer_t);
    if (answer == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        rc = ORTE_ERR_COMM_FAILURE;
        goto cleanup;
    }

    /* the reply must echo the command we sent */
    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_NS_GET_CELL_INFO_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        rc = ORTE_ERR_COMM_FAILURE;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, site, &count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, resource, &count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    rc = ORTE_SUCCESS;

cleanup:
    /* single exit: drop whichever buffers are still live (the success path
     * previously leaked the answer buffer), then release the mutex
     */
    if (NULL != cmd) {
        OBJ_RELEASE(cmd);
    }
    if (NULL != answer) {
        OBJ_RELEASE(answer);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
|
||||
|
||||
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
|
||||
orte_cellid_t cellid, char **nodenames)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
@ -69,6 +248,12 @@ int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnode
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
count = opal_argv_count(nodenames);
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &count, 1, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -138,7 +323,8 @@ int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnode
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_ns_proxy_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
|
||||
int orte_ns_proxy_get_node_info(char ***nodenames, orte_cellid_t cellid,
|
||||
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
|
||||
{
|
||||
orte_buffer_t* cmd;
|
||||
orte_buffer_t* answer;
|
||||
@ -164,6 +350,13 @@ int orte_ns_proxy_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, or
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &num_nodes, 1, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cmd);
|
||||
|
@ -70,6 +70,12 @@ mca_ns_base_component_t mca_ns_proxy_component = {
|
||||
static mca_ns_base_module_t orte_ns_proxy_module = {
|
||||
/* init */
|
||||
orte_ns_proxy_module_init,
|
||||
/* cell functions */
|
||||
orte_ns_proxy_create_cellid,
|
||||
orte_ns_proxy_get_cell_info,
|
||||
orte_ns_base_get_cellid_string,
|
||||
orte_ns_base_convert_cellid_to_string,
|
||||
orte_ns_base_convert_string_to_cellid,
|
||||
/** node functions */
|
||||
orte_ns_proxy_create_nodeids,
|
||||
orte_ns_proxy_get_node_info,
|
||||
@ -104,6 +110,7 @@ static mca_ns_base_module_t orte_ns_proxy_module = {
|
||||
/* data type functions */
|
||||
orte_ns_proxy_define_data_type,
|
||||
/* diagnostic functions */
|
||||
orte_ns_proxy_dump_cells,
|
||||
orte_ns_proxy_dump_jobs,
|
||||
orte_ns_proxy_dump_tags,
|
||||
orte_ns_proxy_dump_datatypes,
|
||||
@ -224,6 +231,17 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* initialize the cell info tracker */
|
||||
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells),
|
||||
(orte_std_cntr_t)orte_ns_proxy.block_size,
|
||||
(orte_std_cntr_t)orte_ns_proxy.max_size,
|
||||
(orte_std_cntr_t)orte_ns_proxy.block_size))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
}
|
||||
orte_ns_proxy.num_cells = 0;
|
||||
|
||||
|
||||
/* initialize the taglist */
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.tags),
|
||||
|
@ -40,6 +40,66 @@
|
||||
/*
|
||||
* DIAGNOSTIC functions
|
||||
*/
|
||||
int orte_ns_proxy_dump_cells(void)
|
||||
{
|
||||
orte_buffer_t cmd;
|
||||
orte_buffer_t answer;
|
||||
orte_ns_cmd_flag_t command;
|
||||
orte_std_cntr_t count;
|
||||
int rc;
|
||||
|
||||
command = ORTE_NS_DUMP_CELLS_CMD;
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);
|
||||
|
||||
/* dump name service replica cell tracker */
|
||||
OBJ_CONSTRUCT(&cmd, orte_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
|
||||
OBJ_CONSTRUCT(&answer, orte_buffer_t);
|
||||
if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&answer);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&answer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_NS_DUMP_CELLS_CMD != command) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
OBJ_DESTRUCT(&answer);
|
||||
return ORTE_ERR_COMM_FAILURE;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&answer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int orte_ns_proxy_dump_jobs(void)
|
||||
{
|
||||
orte_buffer_t cmd;
|
||||
@ -160,8 +220,8 @@ int orte_ns_proxy_dump_tags(void)
|
||||
}
|
||||
|
||||
/* dump local tag tracker */
|
||||
opal_output(mca_ns_base_output, "\n\n%s Dump of Local Tag Tracker\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
|
||||
for (i=0, j=0; j < orte_ns_proxy.num_tags &&
|
||||
i < (orte_ns_proxy.tags)->size; i++) {
|
||||
@ -235,8 +295,8 @@ int orte_ns_proxy_dump_datatypes(void)
|
||||
}
|
||||
|
||||
/* dump local datatype tracker */
|
||||
opal_output(mca_ns_base_output, "\n\n%s Dump of Local Datatype Tracker\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
|
||||
for (i=0, j=0; j < orte_ns_proxy.num_dts &&
|
||||
i < (orte_ns_proxy.dts)->size; i++) {
|
||||
|
@ -47,6 +47,7 @@ int orte_ns_proxy_get_peers(orte_process_name_t **procs,
|
||||
orte_buffer_t* answer;
|
||||
orte_ns_cmd_flag_t command;
|
||||
orte_std_cntr_t count, nprocs, i;
|
||||
orte_cellid_t *cptr;
|
||||
orte_attribute_t *attr;
|
||||
int rc;
|
||||
|
||||
@ -58,12 +59,35 @@ int orte_ns_proxy_get_peers(orte_process_name_t **procs,
|
||||
*procs = NULL;
|
||||
*num_procs = 0;
|
||||
|
||||
/* check the attributes to see if USE_JOB has been set. If not, then this is
|
||||
/* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is
|
||||
* a request for my own job peers - process that one locally
|
||||
*/
|
||||
|
||||
/* if the cell is given AND it matches my own, then we can process this
|
||||
* quickly. Otherwise, we have to do some more work.
|
||||
*
|
||||
* RHC: when we go multi-cell, we need a way to find all the cells upon
|
||||
* which a job is executing so we can make this work!
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
return rc;
|
||||
}
|
||||
if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) {
|
||||
/* get my own job peers */
|
||||
/* get my own job peers, assuming all are on this cell - process here
|
||||
*
|
||||
* RHC: This is a bad assumption. When we go multi-cell, we are going to have to process
|
||||
* get peer requests solely on the HNP since we won't know the cellid otherwise
|
||||
*/
|
||||
*procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t));
|
||||
if (NULL == *procs) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
@ -72,6 +96,7 @@ int orte_ns_proxy_get_peers(orte_process_name_t **procs,
|
||||
}
|
||||
|
||||
for (i=0; i < orte_process_info.num_procs; i++) {
|
||||
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
(*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
(*procs)[i].vpid = orte_process_info.vpid_start + i;
|
||||
}
|
||||
|
@ -0,0 +1,452 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/** @file:
|
||||
*
|
||||
*/
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/trace.h"
|
||||
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/ns/base/ns_private.h"
|
||||
#include "ns_replica.h"
|
||||
|
||||
/**
|
||||
* globals
|
||||
*/
|
||||
#define NS_REPLICA_MAX_STRING_SIZE 256
|
||||
|
||||
/*
|
||||
* DIAGNOSTIC functions
|
||||
*/
|
||||
int orte_ns_replica_dump_cells(void)
|
||||
{
|
||||
orte_buffer_t buffer;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Pack a human-readable dump of the cell tracker into a buffer.
 *
 * One header string is packed first, followed by two strings per tracked
 * cell (its number/cellid and its site/resource description). The name
 * service mutex is held while the tracker is walked.
 *
 * @param buffer  Buffer that receives the packed strings.
 * @return ORTE_SUCCESS, or the error returned by orte_dss.pack().
 */
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
    orte_std_cntr_t i;
    orte_cellid_t j;
    orte_ns_replica_cell_tracker_t **cell;
    char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* orte_dss.pack() wants the address of a char*, so keep a pointer
     * to the scratch array around
     */
    tmp = tmp_out;
    snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
    if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    /* j counts the non-empty slots seen so far, so the walk can stop
     * as soon as every tracked cell has been reported
     */
    for (i=0, j=0; j < orte_ns_replica.num_cells &&
                   i < (orte_ns_replica.cells)->size; i++) {
        if (NULL == cell[i]) {
            continue;
        }
        j++;

        snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
                 (unsigned long)j, (unsigned long)cell[i]->cell);
        if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }

        /* site/resource are stored from unchecked strdup() calls and may
         * therefore be NULL; passing NULL to %s is undefined behavior
         */
        snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
                 (NULL != cell[i]->site) ? cell[i]->site : "(null)",
                 (NULL != cell[i]->resource) ? cell[i]->resource : "(null)");
        if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
int orte_ns_replica_dump_jobs(void)
|
||||
{
|
||||
orte_buffer_t buffer;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Pack a human-readable dump of the jobid tracker into a buffer.
 *
 * A header string is packed first, followed by one string per tracked
 * jobid giving its ordinal, the jobid and the next vpid. The name service
 * mutex is held for the duration of the walk.
 *
 * @param buffer  Buffer that receives the packed strings.
 * @return ORTE_SUCCESS, or the error returned by orte_dss.pack().
 */
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer)
{
    char line[NS_REPLICA_MAX_STRING_SIZE];
    char *line_ptr = line;   /* orte_dss.pack() takes the address of a char* */
    orte_ns_replica_jobid_tracker_t **trackers;
    orte_std_cntr_t slot;
    orte_cellid_t seen;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    snprintf(line_ptr, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Jobid Tracker\n");
    rc = orte_dss.pack(buffer, &line_ptr, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    trackers = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
    slot = 0;
    seen = 0;
    /* stop once every tracked jobid has been reported or the array ends */
    while (seen < orte_ns_replica.num_jobids &&
           slot < (orte_ns_replica.jobids)->size) {
        if (NULL != trackers[slot]) {
            seen++;
            snprintf(line_ptr, NS_REPLICA_MAX_STRING_SIZE,
                     "Num: %lu\tJobid: %lu\tNext vpid: %lu\n",
                     (unsigned long)seen,
                     (unsigned long)trackers[slot]->jobid,
                     (unsigned long)trackers[slot]->next_vpid);
            rc = orte_dss.pack(buffer, &line_ptr, 1, ORTE_STRING);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return rc;
            }
        }
        slot++;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
int orte_ns_replica_dump_tags(void)
|
||||
{
|
||||
orte_buffer_t buffer;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Pack a human-readable dump of the RML tag tracker into a buffer.
 *
 * A header string is packed first, followed by one string per tracked tag
 * giving its ordinal, the tag id and the tag name. The name service mutex
 * is held for the duration of the walk.
 *
 * @param buffer  Buffer that receives the packed strings.
 * @return ORTE_SUCCESS, or the error returned by orte_dss.pack().
 */
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer)
{
    orte_std_cntr_t i;
    orte_rml_tag_t j;
    orte_ns_replica_tagitem_t **ptr;
    char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* orte_dss.pack() wants the address of a char*, so keep a pointer
     * to the scratch array around
     */
    tmp = tmp_out;
    snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n");
    if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    ptr = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
    /* j counts the non-empty slots seen so far, so the walk can stop
     * as soon as every tracked tag has been reported
     */
    for (i=0, j=0; j < orte_ns_replica.num_tags &&
                   i < (orte_ns_replica.tags)->size; i++) {
        if (NULL == ptr[i]) {
            continue;
        }
        j++;

        /* tags assigned without a name are stored with name == NULL;
         * passing NULL to %s is undefined behavior, so substitute a marker
         */
        snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n",
                 (unsigned long)j, (unsigned long)ptr[i]->tag,
                 (NULL != ptr[i]->name) ? ptr[i]->name : "(null)");
        if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
int orte_ns_replica_dump_datatypes(void)
|
||||
{
|
||||
orte_buffer_t buffer;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Pack a human-readable dump of the datatype tracker into a buffer.
 *
 * A header string is packed first, followed by one string per tracked
 * datatype giving its ordinal, the datatype id and its name. The name
 * service mutex is held for the duration of the walk.
 *
 * @param buffer  Buffer that receives the packed strings.
 * @return ORTE_SUCCESS, or the error returned by orte_dss.pack().
 */
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer)
{
    char line[NS_REPLICA_MAX_STRING_SIZE];
    char *line_ptr = line;   /* orte_dss.pack() takes the address of a char* */
    orte_ns_replica_dti_t **entries;
    orte_std_cntr_t slot, seen;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    snprintf(line_ptr, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n");
    rc = orte_dss.pack(buffer, &line_ptr, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    entries = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
    slot = 0;
    seen = 0;
    /* stop once every tracked datatype has been reported or the array ends */
    while (seen < orte_ns_replica.num_dts &&
           slot < (orte_ns_replica.dts)->size) {
        if (NULL != entries[slot]) {
            seen++;
            snprintf(line_ptr, NS_REPLICA_MAX_STRING_SIZE,
                     "Num: %lu\tDatatype id: %lu\tName: %s\n",
                     (unsigned long)seen,
                     (unsigned long)entries[slot]->id,
                     entries[slot]->name);
            rc = orte_dss.pack(buffer, &line_ptr, 1, ORTE_STRING);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return rc;
            }
        }
        slot++;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
/*
|
||||
* TAG SERVER functions
|
||||
*/
|
||||
/**
 * Return the RML tag associated with a name, assigning a new one if needed.
 *
 * If a name is given and is already on the tag list, the existing tag is
 * returned. Otherwise a new tag item is created; its tag value is the
 * current tag count offset by ORTE_RML_TAG_DYNAMIC. On every failure path
 * *tag is left at ORTE_RML_TAG_MAX.
 *
 * @param tag   Receives the tag. Must not be NULL.
 * @param name  Optional name to associate with the tag (copied); may be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM if tag is NULL,
 *         ORTE_ERR_OUT_OF_RESOURCE if no tag or memory is available, or
 *         the error returned by orte_pointer_array_add().
 */
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
                                   char *name)
{
    orte_ns_replica_tagitem_t *tagitem, **tags;
    orte_std_cntr_t i;
    orte_rml_tag_t j;
    int rc;

    if (NULL == tag) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    if (NULL != name) {
        /* see if this name is already in list - if so, return tag */
        tags = (orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr;
        for (i=0, j=0; j < orte_ns_replica.num_tags &&
                       i < (orte_ns_replica.tags)->size; i++) {
            if (NULL != tags[i]) {
                j++;
                if (tags[i]->name != NULL &&
                    0 == strcmp(name, tags[i]->name)) { /* found name on list */
                    *tag = tags[i]->tag;
                    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                    return ORTE_SUCCESS;
                }
            }
        }
    }

    /* not in list or not provided, so allocate next tag */
    *tag = ORTE_RML_TAG_MAX;

    /* check if tag is available - need to do this since the tag type
     * is probably not going to be an orte_std_cntr_t, so we cannot just rely
     * on the pointer_array's size limits to protect us. NOTE: need to
     * reserve ORTE_RML_TAG_MAX as an invalid value. The assigned tag is
     * num_tags + ORTE_RML_TAG_DYNAMIC, so it is that sum - not num_tags
     * alone - that must stay below ORTE_RML_TAG_MAX. The comparison is
     * written as a subtraction so it cannot itself overflow.
     */
    if (ORTE_RML_TAG_MAX - ORTE_RML_TAG_DYNAMIC <= orte_ns_replica.num_tags) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
    if (NULL == tagitem) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_replica.tags, tagitem))) {
        ORTE_ERROR_LOG(rc);
        /* the item never made it into the array, so nobody else will
         * release it
         */
        OBJ_RELEASE(tagitem);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }
    tagitem->tag = orte_ns_replica.num_tags + ORTE_RML_TAG_DYNAMIC;
    (orte_ns_replica.num_tags)++;
    if (NULL != name) {  /* provided - can look it up later */
        tagitem->name = strdup(name);
    } else {
        tagitem->name = NULL;
    }

    *tag = tagitem->tag;
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
/*
|
||||
* DATA TYPE SERVER functions
|
||||
*/
|
||||
int orte_ns_replica_define_data_type(const char *name,
|
||||
orte_data_type_t *type)
|
||||
{
|
||||
orte_ns_replica_dti_t **dti, *dtip;
|
||||
orte_std_cntr_t i, j;
|
||||
int rc;
|
||||
|
||||
if (NULL == name || 0 < *type) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
|
||||
|
||||
dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr;
|
||||
for (i=0, j=0; j < orte_ns_replica.num_dts &&
|
||||
i < orte_ns_replica.dts->size; i++) {
|
||||
if (NULL != dti[i]) {
|
||||
j++;
|
||||
if (dti[i]->name != NULL &&
|
||||
0 == strcmp(name, dti[i]->name)) { /* found name on list */
|
||||
*type = dti[i]->id;
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* not in list or not provided, so allocate next id */
|
||||
*type = ORTE_DSS_ID_MAX;
|
||||
|
||||
/* check if id is available - need to do this since the data type
|
||||
* is probably not going to be a orte_std_cntr_t, so we cannot just rely
|
||||
* on the pointer_array's size limits to protect us.
|
||||
*/
|
||||
if (ORTE_DSS_ID_MAX-2 < orte_ns_replica.num_dts) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
dtip = OBJ_NEW(orte_ns_replica_dti_t);
|
||||
if (NULL == dtip) { /* out of memory */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
dtip->name = strdup(name);
|
||||
if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
|
||||
orte_ns_replica.dts, dtip))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return rc;
|
||||
}
|
||||
dtip->id = orte_ns_replica.num_dts;
|
||||
(orte_ns_replica.num_dts)++;
|
||||
|
||||
*type = dtip->id;
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* NAME functions
|
||||
*/
|
||||
int orte_ns_replica_create_my_name(void)
|
||||
{
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_jobid(&jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.reserve_range(jobid, 1, &vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
||||
0, jobid, vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -39,6 +39,31 @@ extern "C" {
|
||||
*/
|
||||
#define NS_REPLICA_MAX_STRING_SIZE 256
|
||||
|
||||
|
||||
/* class for tracking cellid's */
|
||||
struct orte_ns_replica_cell_tracker_t {
|
||||
opal_object_t super; /* base object - this struct is declared as an OPAL class below */
|
||||
orte_cellid_t cell; /* id of the cell this entry describes */
|
||||
char *site; /* descriptive site string; owned by this entry */
|
||||
char *resource; /* descriptive resource string; owned by this entry */
|
||||
orte_nodeid_t next_nodeid; /* next nodeid to hand out within this cell */
|
||||
orte_pointer_array_t *nodeids; /* array of orte_ns_replica_nodeid_tracker_t* for this cell */
|
||||
};
|
||||
typedef struct orte_ns_replica_cell_tracker_t orte_ns_replica_cell_tracker_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t);
|
||||
|
||||
/* object for tracking nodeid's */
|
||||
struct orte_ns_replica_nodeid_tracker_t {
|
||||
opal_object_t super; /* base object - this struct is declared as an OPAL class below */
|
||||
char *nodename; /* name of the node; owned by this entry */
|
||||
orte_nodeid_t nodeid; /* id assigned to that node */
|
||||
};
|
||||
typedef struct orte_ns_replica_nodeid_tracker_t orte_ns_replica_nodeid_tracker_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(orte_ns_replica_nodeid_tracker_t);
|
||||
|
||||
|
||||
/*
|
||||
* object for tracking vpids and jobids for job families
|
||||
* This structure is used to track the parent-child relationship between
|
||||
@ -81,8 +106,8 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_dti_t);
|
||||
*/
|
||||
typedef struct {
|
||||
size_t max_size, block_size;
|
||||
orte_nodeid_t next_nodeid;
|
||||
orte_pointer_array_t *nodenames;
|
||||
orte_cellid_t num_cells;
|
||||
orte_pointer_array_t *cells;
|
||||
orte_jobid_t num_jobids;
|
||||
opal_list_t jobs;
|
||||
orte_pointer_array_t *tags;
|
||||
@ -118,11 +143,17 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
|
||||
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata);
|
||||
|
||||
/*
|
||||
* NODE FUNCTIONS
|
||||
* CELL FUNCTIONS
|
||||
*/
|
||||
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames);
|
||||
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource);
|
||||
|
||||
int orte_ns_replica_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
|
||||
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
|
||||
char **site, char **resource);
|
||||
|
||||
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
|
||||
orte_cellid_t cellid, char **nodenames);
|
||||
|
||||
int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
|
||||
|
||||
/*
|
||||
* JOB FUNCTIONS
|
||||
@ -164,6 +195,9 @@ int orte_ns_replica_create_my_name(void);
|
||||
/*
|
||||
* DIAGNOSTIC FUNCTIONS
|
||||
*/
|
||||
int orte_ns_replica_dump_cells(void);
|
||||
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer);
|
||||
|
||||
int orte_ns_replica_dump_jobs(void);
|
||||
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer);
|
||||
|
||||
|
@ -38,14 +38,141 @@
|
||||
* functions
|
||||
*/
|
||||
|
||||
/**
 * Create a cell entry, or update the description of an existing one.
 *
 * Two modes, selected by the incoming value of *cellid:
 *  - *cellid is a valid id: the entry with that id gets its site/resource
 *    strings replaced; if no such entry exists, one is created with that id.
 *  - *cellid is ORTE_CELLID_INVALID: if an entry with the same site and
 *    resource already exists its id is returned; otherwise a new entry is
 *    created whose id is the current number of cells.
 *
 * *cellid is only written on success, and an existing entry is only
 * modified once the new strings have been copied successfully.
 *
 * @param cellid    In: a valid cellid or ORTE_CELLID_INVALID. Out: the
 *                  cellid of the entry. Must not be NULL.
 * @param site      Site description (copied). Must not be NULL.
 * @param resource  Resource description (copied). Must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for a NULL argument,
 *         ORTE_ERR_OUT_OF_RESOURCE if no cellid or memory is available, or
 *         the error returned by orte_pointer_array_add().
 */
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_ns_replica_cell_tracker_t *target = NULL, **cell;
    orte_cellid_t new_id;
    char *site_copy = NULL, *resource_copy = NULL;
    int rc, created = 0;
    orte_std_cntr_t i, j, index;

    OPAL_TRACE(1);

    /* both strings are copied with strdup() in every mode, so neither
     * may be NULL no matter which cellid was passed in
     */
    if (NULL == cellid || NULL == site || NULL == resource) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    new_id = *cellid;

    if (ORTE_CELLID_INVALID != new_id) {
        /* a valid cellid was given - see if the cell info is already
         * present, in which case only its description is updated
         */
        for (i=0, j=0; j < orte_ns_replica.num_cells &&
                       i < (orte_ns_replica.cells)->size; i++) {
            if (NULL != cell[i]) {
                j++;
                if (cell[i]->cell == new_id) {
                    target = cell[i];
                    break;
                }
            }
        }
    } else {
        /* no cellid given - is this site/resource pair already known? */
        for (i=0, j=0; j < orte_ns_replica.num_cells &&
                       i < (orte_ns_replica.cells)->size; i++) {
            if (NULL != cell[i]) {
                j++;
                /* stored strings can be NULL if an earlier copy failed */
                if (NULL != cell[i]->site && NULL != cell[i]->resource &&
                    0 == strcmp(site, cell[i]->site) &&
                    0 == strcmp(resource, cell[i]->resource)) {
                    *cellid = cell[i]->cell;
                    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                    return ORTE_SUCCESS;
                }
            }
        }
        new_id = orte_ns_replica.num_cells;
    }

    if (NULL == target) {
        /* new cell - check if cellid is available */
        if (ORTE_CELLID_MAX-1 < orte_ns_replica.num_cells) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto fail;
        }
        target = OBJ_NEW(orte_ns_replica_cell_tracker_t);
        if (NULL == target) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto fail;
        }
        created = 1;
    }

    /* copy the new description before touching the entry, so an existing
     * entry keeps its old strings if either copy fails
     */
    site_copy = strdup(site);
    resource_copy = strdup(resource);
    if (NULL == site_copy || NULL == resource_copy) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto fail;
    }

    if (created) {
        if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, orte_ns_replica.cells, target))) {
            goto fail;
        }
        (orte_ns_replica.num_cells)++;
        target->cell = new_id;
    } else {
        /* drop the description that is being replaced */
        free(target->site);
        free(target->resource);
    }

    target->site = site_copy;
    target->resource = resource_copy;
    *cellid = new_id;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;

fail:
    ORTE_ERROR_LOG(rc);
    free(site_copy);
    free(resource_copy);
    if (created) {
        /* the new entry never made it into the array, so nobody else
         * will release it
         */
        OBJ_RELEASE(target);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
|
||||
|
||||
/**
 * Look up the site and resource description of a cell.
 *
 * On success the caller receives freshly allocated copies of the stored
 * strings and is responsible for freeing them. If a stored string is NULL,
 * NULL is returned for it.
 *
 * @param cellid    Id of the cell to look up.
 * @param site      Receives a copy of the site string. Must not be NULL.
 * @param resource  Receives a copy of the resource string. Must not be NULL.
 * @return ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for a NULL output pointer,
 *         ORTE_ERR_OUT_OF_RESOURCE if a copy could not be allocated, or
 *         ORTE_ERR_NOT_FOUND if the cell is unknown (not logged, since
 *         that is not considered an error).
 */
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
                                  char **site, char **resource)
{
    orte_std_cntr_t i;
    orte_cellid_t j;
    orte_ns_replica_cell_tracker_t **cell;
    char *site_copy, *resource_copy;

    OPAL_TRACE(1);

    if (NULL == site || NULL == resource) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    for (i=0, j=0; j < orte_ns_replica.num_cells &&
                   i < (orte_ns_replica.cells)->size; i++) {
        if (NULL == cell[i]) {
            continue;
        }
        j++;
        if (cellid != cell[i]->cell) {
            continue;
        }

        /* stored strings may be NULL; strdup(NULL) is undefined, so
         * only copy what is actually there
         */
        site_copy = (NULL != cell[i]->site) ? strdup(cell[i]->site) : NULL;
        resource_copy = (NULL != cell[i]->resource) ? strdup(cell[i]->resource) : NULL;
        if ((NULL != cell[i]->site && NULL == site_copy) ||
            (NULL != cell[i]->resource && NULL == resource_copy)) {
            /* do not report success with a silently missing string */
            free(site_copy);
            free(resource_copy);
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        *site = site_copy;
        *resource = resource_copy;
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_SUCCESS;
    }

    /* it isn't an error to not find the cell - so do NOT
     * report it via ORTE_ERROR_LOG
     */

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_ERR_NOT_FOUND;
}
|
||||
|
||||
/*
|
||||
* NODEID
|
||||
*/
|
||||
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames)
|
||||
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
|
||||
orte_cellid_t cellid, char **nodenames)
|
||||
{
|
||||
orte_nodeid_t *nds, nid, m;
|
||||
orte_std_cntr_t k, n, num_nodes;
|
||||
char **nodes;
|
||||
orte_ns_replica_cell_tracker_t **cell, *cptr;
|
||||
orte_ns_replica_nodeid_tracker_t **nodes, *node;
|
||||
orte_nodeid_t *nds, nid;
|
||||
orte_std_cntr_t i, j, k, m, n, num_nodes;
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
|
||||
|
||||
@ -62,20 +189,54 @@ int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nno
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
nodes = (char**)(orte_ns_replica.nodenames->addr);
|
||||
/** find the cell */
|
||||
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
|
||||
for (i=0, j=0; j < orte_ns_replica.num_cells &&
|
||||
i < (orte_ns_replica.cells)->size; i++) {
|
||||
if (NULL != cell[i]) {
|
||||
j++;
|
||||
if (cellid == cell[i]->cell) {
|
||||
/** found the specified cell - check to see if nodename has already been
|
||||
* defined. if so, just return the nodeid. if not, create a new one
|
||||
*/
|
||||
cptr = cell[i];
|
||||
goto PROCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
/** get here if we didn't find the cell */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
free(nds);
|
||||
*nodeids = NULL;
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
|
||||
PROCESS:
|
||||
nodes = (orte_ns_replica_nodeid_tracker_t**)(cptr->nodeids->addr);
|
||||
for (n=0; n < num_nodes; n++) {
|
||||
for (k=0, m=0; m < orte_ns_replica.next_nodeid &&
|
||||
k < (orte_ns_replica.nodenames)->size; k++) {
|
||||
for (k=0, m=0; m < cptr->next_nodeid &&
|
||||
k < (cptr->nodeids)->size; k++) {
|
||||
if (NULL != nodes[k]) {
|
||||
m++;
|
||||
if (strcmp(nodenames[n], nodes[k]) == 0) { /** found same name */
|
||||
nid = m;
|
||||
if (strcmp(nodenames[n], nodes[k]->nodename) == 0) { /** found same name */
|
||||
nid = nodes[k]->nodeid;
|
||||
goto ASSIGN;
|
||||
}
|
||||
}
|
||||
}
|
||||
/** get here if we don't find this nodename - add it */
|
||||
nid = orte_ns_replica.next_nodeid++;
|
||||
/** get here if we don't find this nodename - add one */
|
||||
node = OBJ_NEW(orte_ns_replica_nodeid_tracker_t);
|
||||
if (NULL == node) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(nds);
|
||||
*nodeids = NULL;
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
node->nodename = strdup(nodenames[n]);
|
||||
node->nodeid = cptr->next_nodeid;
|
||||
cptr->next_nodeid++;
|
||||
nid = node->nodeid;
|
||||
|
||||
ASSIGN:
|
||||
nds[n] = nid;
|
||||
@ -88,13 +249,16 @@ ASSIGN:
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_ns_replica_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
|
||||
int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid,
|
||||
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
|
||||
{
|
||||
char **names;
|
||||
orte_std_cntr_t n;
|
||||
char **nodes;
|
||||
char **names, *nm;
|
||||
orte_ns_replica_cell_tracker_t **cell, *cptr;
|
||||
orte_ns_replica_nodeid_tracker_t **nodes;
|
||||
orte_std_cntr_t i, j, k, m, n;
|
||||
char *err_name = "NODE_NOT_FOUND";
|
||||
|
||||
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
|
||||
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
|
||||
|
||||
if (0 == num_nodes) {
|
||||
*nodenames = NULL;
|
||||
@ -109,16 +273,49 @@ int orte_ns_replica_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes,
|
||||
}
|
||||
names[num_nodes] = NULL; /** NULL-terminate the list */
|
||||
|
||||
nodes = (char**)(orte_ns_replica.nodenames->addr);
|
||||
for (n=0; n < num_nodes; n++) {
|
||||
if (nodeids[n] >= orte_ns_replica.next_nodeid) {
|
||||
names[n] = strdup("invalid nodeid");
|
||||
} else if (NULL != nodes[nodeids[n]]) {
|
||||
names[n] = strdup(nodes[nodeids[n]]);
|
||||
} else {
|
||||
names[n] = strdup("unknown nodeid");
|
||||
/** find the cell */
|
||||
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
|
||||
for (i=0, j=0; j < orte_ns_replica.num_cells &&
|
||||
i < (orte_ns_replica.cells)->size; i++) {
|
||||
if (NULL != cell[i]) {
|
||||
j++;
|
||||
if (cellid == cell[i]->cell) {
|
||||
/** found the specified cell - its nodeid trackers are searched below to
|
||||
* translate each requested nodeid into the corresponding nodename
|
||||
*/
|
||||
cptr = cell[i];
|
||||
goto PROCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
/** get here if we didn't find the cell */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
free(names);
|
||||
*nodenames = NULL;
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
|
||||
PROCESS:
|
||||
nodes = (orte_ns_replica_nodeid_tracker_t**)(cell[i]->nodeids->addr);
|
||||
for (n=0; n < num_nodes; n++) {
|
||||
for (k=0, m=0; m < cell[i]->next_nodeid &&
|
||||
k < (cell[i]->nodeids)->size; k++) {
|
||||
if (NULL != nodes[k]) {
|
||||
m++;
|
||||
if (nodeids[n] == nodes[k]->nodeid) { /** found it */
|
||||
nm = nodes[k]->nodename;
|
||||
goto ASSIGN;
|
||||
}
|
||||
}
|
||||
}
|
||||
/** node not found - set name to error name. Can't set it to NULL since
|
||||
* the list is a NULL-terminated one
|
||||
*/
|
||||
nm = err_name;
|
||||
|
||||
ASSIGN:
|
||||
names[n] = strdup(nm);
|
||||
}
|
||||
|
||||
*nodenames = names;
|
||||
|
||||
|
@ -34,6 +34,67 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*** CELLID ***/
|
||||
/* constructor - used to initialize state of cell_tracker instance */
|
||||
/* constructor: start with an invalid cellid, no description, and an
 * empty nodeid array sized from the name service's block/max settings
 */
static void orte_ns_replica_cell_tracker_construct(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    cell_tracker->resource = NULL;
    cell_tracker->site = NULL;
    cell_tracker->cell = ORTE_CELLID_INVALID;

    cell_tracker->next_nodeid = 0;
    orte_pointer_array_init(&(cell_tracker->nodeids),
                            orte_ns_replica.block_size,
                            orte_ns_replica.max_size,
                            orte_ns_replica.block_size);
}

/* destructor: free the description strings, release every nodeid
 * tracker held in the array, then release the array itself
 */
static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    orte_ns_replica_nodeid_tracker_t **trackers;
    orte_std_cntr_t slot, seen;

    /* free(NULL) is a no-op, so no guard is needed */
    free(cell_tracker->site);
    free(cell_tracker->resource);

    trackers = (orte_ns_replica_nodeid_tracker_t**)(cell_tracker->nodeids)->addr;

    slot = 0;
    seen = 0;
    /* stop once next_nodeid entries have been released or the array ends */
    while (seen < cell_tracker->next_nodeid &&
           slot < (cell_tracker->nodeids)->size) {
        if (NULL != trackers[slot]) {
            seen++;
            OBJ_RELEASE(trackers[slot]);
        }
        slot++;
    }
    OBJ_RELEASE(cell_tracker->nodeids);
}

/* bind the constructor/destructor above to the cell tracker class */
OBJ_CLASS_INSTANCE(orte_ns_replica_cell_tracker_t,  /* type name */
                   opal_object_t, /* parent "class" name */
                   orte_ns_replica_cell_tracker_construct, /* constructor */
                   orte_ns_replica_cell_tracker_destructor); /* destructor */
|
||||
|
||||
|
||||
/** NODEID */
|
||||
/* constructor: no name yet, and an invalid nodeid */
static void orte_ns_replica_nodeid_tracker_construct(orte_ns_replica_nodeid_tracker_t *ptr)
{
    ptr->nodename = NULL;
    ptr->nodeid = ORTE_NODEID_INVALID;
}

/* destructor: the tracker owns its nodename string */
static void orte_ns_replica_nodeid_tracker_destructor(orte_ns_replica_nodeid_tracker_t *ptr)
{
    /* free(NULL) is a no-op, so no guard is needed */
    free(ptr->nodename);
}

/* bind the constructor/destructor above to the nodeid tracker class */
OBJ_CLASS_INSTANCE(orte_ns_replica_nodeid_tracker_t,  /* type name */
                   opal_object_t, /* parent "class" name */
                   orte_ns_replica_nodeid_tracker_construct, /* constructor */
                   orte_ns_replica_nodeid_tracker_destructor); /* destructor */
|
||||
|
||||
|
||||
/*** JOBITEM ***/
|
||||
/* constructor - used to initialize state of jobitem instance */
|
||||
static void orte_ns_replica_jobitem_construct(orte_ns_replica_jobitem_t *ptr)
|
||||
|
@ -77,6 +77,12 @@ orte_ns_replica_finalize /* module shutdown */
|
||||
static mca_ns_base_module_t orte_ns_replica_module = {
|
||||
/* init */
|
||||
orte_ns_replica_module_init,
|
||||
/* cell functions */
|
||||
orte_ns_replica_create_cellid,
|
||||
orte_ns_replica_get_cell_info,
|
||||
orte_ns_base_get_cellid_string,
|
||||
orte_ns_base_convert_cellid_to_string,
|
||||
orte_ns_base_convert_string_to_cellid,
|
||||
/** node functions */
|
||||
orte_ns_replica_create_nodeids,
|
||||
orte_ns_replica_get_node_info,
|
||||
@ -111,6 +117,7 @@ static mca_ns_base_module_t orte_ns_replica_module = {
|
||||
/* data type functions */
|
||||
orte_ns_replica_define_data_type,
|
||||
/* diagnostic functions */
|
||||
orte_ns_replica_dump_cells,
|
||||
orte_ns_replica_dump_jobs,
|
||||
orte_ns_replica_dump_tags,
|
||||
orte_ns_replica_dump_datatypes,
|
||||
@ -188,15 +195,15 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
|
||||
|
||||
*priority = 50;
|
||||
|
||||
/* initialize the node tracker */
|
||||
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.nodenames),
|
||||
/* initialize the cell info tracker */
|
||||
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
|
||||
(orte_std_cntr_t)orte_ns_replica.block_size,
|
||||
(orte_std_cntr_t)orte_ns_replica.max_size,
|
||||
(orte_std_cntr_t)orte_ns_replica.block_size))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
}
|
||||
orte_ns_replica.next_nodeid = 0;
|
||||
orte_ns_replica.num_cells = 0;
|
||||
|
||||
/* initialize the job tracking system */
|
||||
OBJ_CONSTRUCT(&orte_ns_replica.jobs, opal_list_t);
|
||||
@ -258,25 +265,22 @@ int orte_ns_replica_module_init(void)
|
||||
*/
|
||||
int orte_ns_replica_finalize(void)
|
||||
{
|
||||
char **cptr;
|
||||
orte_ns_replica_cell_tracker_t **cptr;
|
||||
opal_list_item_t *item;
|
||||
orte_ns_replica_tagitem_t **tag;
|
||||
orte_ns_replica_dti_t **dti;
|
||||
orte_std_cntr_t i;
|
||||
orte_nodeid_t j;
|
||||
|
||||
/* free all tracking storage, but only if this component was initialized */
|
||||
|
||||
if (initialized) {
|
||||
cptr = (char**)(orte_ns_replica.nodenames)->addr;
|
||||
for (i=0, j=0; j < orte_ns_replica.next_nodeid &&
|
||||
i < (orte_ns_replica.nodenames)->size; i++) {
|
||||
cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
|
||||
for (i=0; i < (orte_ns_replica.cells)->size; i++) {
|
||||
if (NULL != cptr[i]) {
|
||||
j++;
|
||||
free(cptr[i]);
|
||||
OBJ_RELEASE(cptr[i]);
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(orte_ns_replica.nodenames);
|
||||
OBJ_RELEASE(orte_ns_replica.cells);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&orte_ns_replica.jobs))) {
|
||||
OBJ_RELEASE(item);
|
||||
|
@ -37,6 +37,72 @@
|
||||
/*
|
||||
* DIAGNOSTIC functions
|
||||
*/
|
||||
int orte_ns_replica_dump_cells(void)
|
||||
{
|
||||
orte_buffer_t buffer;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, orte_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Pack a human-readable dump of the cell tracker into a buffer.
 *
 * One header string is packed first, followed by two strings per tracked
 * cell (its number/cellid and its site/resource description). The name
 * service mutex is held while the tracker is walked.
 *
 * @param buffer  Buffer that receives the packed strings.
 * @return ORTE_SUCCESS, or the error returned by orte_dss.pack().
 */
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
    orte_std_cntr_t i;
    orte_cellid_t j;
    orte_ns_replica_cell_tracker_t **cell;
    char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    /* orte_dss.pack() wants the address of a char*, so keep a pointer
     * to the scratch array around
     */
    tmp = tmp_out;
    snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
    if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    /* j counts the non-empty slots seen so far, so the walk can stop
     * as soon as every tracked cell has been reported
     */
    for (i=0, j=0; j < orte_ns_replica.num_cells &&
                   i < (orte_ns_replica.cells)->size; i++) {
        if (NULL == cell[i]) {
            continue;
        }
        j++;

        snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
                 (unsigned long)j, (unsigned long)cell[i]->cell);
        if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }

        /* site/resource are stored from unchecked strdup() calls and may
         * therefore be NULL; passing NULL to %s is undefined behavior
         */
        snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
                 (NULL != cell[i]->site) ? cell[i]->site : "(null)",
                 (NULL != cell[i]->resource) ? cell[i]->resource : "(null)");
        if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);

    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
int orte_ns_replica_dump_jobs(void)
|
||||
{
|
||||
orte_buffer_t buffer;
|
||||
|
@ -41,6 +41,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
{
|
||||
orte_std_cntr_t i, isave, npeers;
|
||||
orte_jobid_t *jptr;
|
||||
orte_cellid_t *cptr;
|
||||
orte_attribute_t *attr;
|
||||
orte_ns_replica_jobitem_t *job_info, *child;
|
||||
opal_list_item_t *item;
|
||||
@ -55,12 +56,31 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
*procs = NULL;
|
||||
*num_procs = 0;
|
||||
|
||||
/* check the attributes to see if USE_JOB has been set. If not, then this is
|
||||
/* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is
|
||||
* a request for my own job peers - process that one locally
|
||||
*/
|
||||
|
||||
/* if the cell is given AND it matches my own, then we can process this
|
||||
* quickly. Otherwise, we have to do some more work.
|
||||
*
|
||||
* RHC: when we go multi-cell, we need a way to find all the cells upon
|
||||
* which a job is executing so we can make this work!
|
||||
*/
|
||||
if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return rc;
|
||||
}
|
||||
if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
|
||||
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) {
|
||||
/* get my own job peers */
|
||||
/* get my own job peers, assuming all are on this cell */
|
||||
*procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t));
|
||||
if (NULL == *procs) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
@ -69,6 +89,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
}
|
||||
|
||||
for (i=0; i < orte_process_info.num_procs; i++) {
|
||||
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
(*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
(*procs)[i].vpid = orte_process_info.vpid_start + i;
|
||||
}
|
||||
@ -130,6 +151,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
while (NULL != (item = opal_list_remove_first(&peerlist))) {
|
||||
child = (orte_ns_replica_jobitem_t*)item;
|
||||
for (i=0; i < child->next_vpid; i++) {
|
||||
(*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
(*procs)[i+isave].jobid = child->jobid;
|
||||
(*procs)[i+isave].vpid = i;
|
||||
}
|
||||
@ -164,6 +186,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
|
||||
/* populate it, starting with the specified job followed by its children */
|
||||
for (i=0; i < job_info->next_vpid; i++) {
|
||||
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
(*procs)[i].jobid = *jptr;
|
||||
(*procs)[i].vpid = i;
|
||||
}
|
||||
@ -173,6 +196,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_ns_replica_jobitem_t*)item;
|
||||
for (i=0; i < child->next_vpid; i++) {
|
||||
(*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
(*procs)[i+isave].jobid = child->jobid;
|
||||
(*procs)[i+isave].vpid = i;
|
||||
}
|
||||
@ -196,6 +220,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
|
||||
}
|
||||
|
||||
for (i=0; i < job_info->next_vpid; i++) {
|
||||
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
(*procs)[i].jobid = *jptr;
|
||||
(*procs)[i].vpid = i;
|
||||
}
|
||||
@ -367,7 +392,8 @@ int orte_ns_replica_create_my_name(void)
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name), jobid, vpid))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name),
|
||||
0, jobid, vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -58,14 +58,15 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
|
||||
orte_buffer_t answer, error_answer;
|
||||
orte_ns_cmd_flag_t command;
|
||||
opal_list_t attrs;
|
||||
orte_cellid_t cell;
|
||||
orte_jobid_t job, root, *descendants;
|
||||
orte_vpid_t startvpid, range;
|
||||
char *tagname;
|
||||
char *tagname, *site, *resource;
|
||||
orte_rml_tag_t oob_tag;
|
||||
orte_data_type_t type;
|
||||
orte_std_cntr_t count, nprocs, nret;
|
||||
orte_process_name_t *procs;
|
||||
int rc=ORTE_SUCCESS;
|
||||
int rc=ORTE_SUCCESS, ret;
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_NS_CMD))) {
|
||||
@ -81,6 +82,69 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
case ORTE_NS_CREATE_CELLID_CMD:
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &site, &count, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &resource, &count, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
rc = orte_ns_replica_create_cellid(&cell, site, resource);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &cell, 1, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_NS_GET_CELL_INFO_CMD:
|
||||
count = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
rc = ORTE_ERR_BAD_PARAM;
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
site = NULL;
|
||||
resource = NULL;
|
||||
rc = orte_ns_replica_get_cell_info(cell, &site, &resource);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &site, 1, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &resource, 1, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_NS_CREATE_NODEID_CMD:
|
||||
case ORTE_NS_GET_NODE_INFO_CMD:
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
|
||||
@ -390,6 +454,17 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_NS_DUMP_CELLS_CMD:
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
goto RETURN_ERROR;
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_NS_DUMP_JOBIDS_CMD:
|
||||
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -677,8 +677,8 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data,)
|
||||
child = (odls_bproc_child_t *) item;
|
||||
if(0 < mca_odls_bproc_component.debug) {
|
||||
opal_output(0, "orte_odls_bproc_launch: setting up io for "
|
||||
"%s proc rank %ld\n",
|
||||
ORTE_NAME_PRINT((child->name)),
|
||||
"[%ld,%ld,%ld] proc rank %ld\n",
|
||||
ORTE_NAME_ARGS((child->name)),
|
||||
(long)child->name->vpid);
|
||||
}
|
||||
/* only setup to forward stdin if it is rank 0, otherwise connect
|
||||
|
@ -530,8 +530,8 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
|
||||
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
|
||||
|
||||
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: working on job %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)job);
|
||||
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job);
|
||||
|
||||
/* since we are going to be working with the global list of
|
||||
* children, we need to protect that list from modification
|
||||
@ -547,8 +547,8 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
/* preserve the pointer to the next item in list in case we release it */
|
||||
next = opal_list_get_next(item);
|
||||
|
||||
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: checking child process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
|
||||
|
||||
/* do we have a child from the specified job? Because the
|
||||
* job could be given as a WILDCARD value, we must use
|
||||
@ -565,8 +565,8 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
* to do to it
|
||||
*/
|
||||
if (!child->alive) {
|
||||
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: child %s is not alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child [%ld,%ld,%ld] is not alive",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
|
||||
/* ensure, though, that the state is terminated so we don't lockup if
|
||||
* the proc never started
|
||||
*/
|
||||
@ -702,8 +702,8 @@ GOTCHILD:
|
||||
exception is detected and handled (in which case this unpublish
|
||||
request will be ignored/discarded. */
|
||||
opal_output(orte_odls_globals.output,
|
||||
"odls: pid %ld corresponds to %s\n",
|
||||
(long) pid, ORTE_NAME_PRINT(child->name));
|
||||
"odls: pid %ld corresponds to [%lu,%lu,%lu]\n",
|
||||
(long) pid, ORTE_NAME_ARGS(child->name));
|
||||
if (0 == child->name->vpid) {
|
||||
rc = orte_iof.iof_unpublish(child->name, ORTE_NS_CMP_ALL,
|
||||
ORTE_IOF_STDIN);
|
||||
@ -751,20 +751,20 @@ GOTCHILD:
|
||||
/* the abort file must exist - there is nothing in it we need. Its
|
||||
* mere existence indicates that an abnormal termination occurred
|
||||
*/
|
||||
opal_output(orte_odls_globals.output, "odls: child %s died by abort",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
aborted = true;
|
||||
free(abort_file);
|
||||
} else {
|
||||
opal_output(orte_odls_globals.output, "odls: child process %s terminated normally",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
}
|
||||
} else {
|
||||
/* the process was terminated with a signal! That's definitely
|
||||
* abnormal, so indicate that condition
|
||||
*/
|
||||
opal_output(orte_odls_globals.output, "odls: child process %s terminated with signal",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
aborted = true;
|
||||
}
|
||||
|
||||
@ -1419,6 +1419,7 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
filem_request->num_procs = 1;
|
||||
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
|
||||
filem_request->proc_name[0].cellid = orte_process_info.gpr_replica->cellid;
|
||||
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
|
||||
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
|
||||
if(app_item->app_context->preload_binary) {
|
||||
@ -1517,8 +1518,8 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
* If it has been launched, then do nothing
|
||||
*/
|
||||
if (child->alive) {
|
||||
opal_output(orte_odls_globals.output, "odls: child %s is already alive",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is already alive",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1527,13 +1528,13 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
* the dss.compare function to check for equality.
|
||||
*/
|
||||
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
|
||||
opal_output(orte_odls_globals.output, "odls: child %s is not in job %ld being launched",
|
||||
ORTE_NAME_PRINT(child->name), (long)job);
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is not in job %ld being launched",
|
||||
ORTE_NAME_ARGS(child->name), (long)job);
|
||||
continue;
|
||||
}
|
||||
|
||||
opal_output(orte_odls_globals.output, "odls: preparing to launch child %s",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
|
||||
/* find the indicated app_context in the list */
|
||||
for (item2 = opal_list_get_first(&app_context_list);
|
||||
@ -1710,8 +1711,8 @@ int orte_odls_default_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, o
|
||||
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
|
||||
continue;
|
||||
}
|
||||
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child %s",
|
||||
(unsigned long)tag, ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child [%ld, %ld, %ld]",
|
||||
(unsigned long)tag, ORTE_NAME_ARGS(child->name));
|
||||
|
||||
/* if so, send the message */
|
||||
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);
|
||||
|
@ -248,8 +248,8 @@ static int orte_odls_process_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
|
||||
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
|
||||
|
||||
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: working on job %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)job);
|
||||
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job);
|
||||
|
||||
/* since we are going to be working with the global list of
|
||||
* children, we need to protect that list from modification
|
||||
@ -265,8 +265,8 @@ static int orte_odls_process_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
/* preserve the pointer to the next item in list in case we release it */
|
||||
next = opal_list_get_next(item);
|
||||
|
||||
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: checking child process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
|
||||
|
||||
/* do we have a child from the specified job? Because the
|
||||
* job could be given as a WILDCARD value, we must use
|
||||
@ -283,8 +283,8 @@ static int orte_odls_process_kill_local_procs(orte_jobid_t job, bool set_state)
|
||||
* to do to it
|
||||
*/
|
||||
if (!child->alive) {
|
||||
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: child %s is not alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child [%ld,%ld,%ld] is not alive",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
|
||||
/* ensure, though, that the state is terminated so we don't lockup if
|
||||
* the proc never started
|
||||
*/
|
||||
@ -418,8 +418,8 @@ GOTCHILD:
|
||||
exception is detected and handled (in which case this unpublish
|
||||
request will be ignored/discarded. */
|
||||
opal_output(orte_odls_globals.output,
|
||||
"odls: pid %ld corresponds to %s\n",
|
||||
(long) pid, ORTE_NAME_PRINT(child->name));
|
||||
"odls: pid %ld corresponds to [%lu,%lu,%lu]\n",
|
||||
(long) pid, ORTE_NAME_ARGS(child->name));
|
||||
#if 0
|
||||
if (0 == child->name->vpid) {
|
||||
rc = orte_iof.iof_unpublish(child->name, ORTE_NS_CMP_ALL,
|
||||
@ -461,20 +461,20 @@ GOTCHILD:
|
||||
/* the abort file must exist - there is nothing in it we need. Its
|
||||
* mere existence indicates that an abnormal termination occurred
|
||||
*/
|
||||
opal_output(orte_odls_globals.output, "odls: child %s died by abort",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
aborted = true;
|
||||
free(abort_file);
|
||||
} else {
|
||||
opal_output(orte_odls_globals.output, "odls: child process %s terminated normally",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
}
|
||||
} else {
|
||||
/* the process was terminated with a signal! That's definitely
|
||||
* abnormal, so indicate that condition
|
||||
*/
|
||||
opal_output(orte_odls_globals.output, "odls: child process %s terminated with signal",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
aborted = true;
|
||||
}
|
||||
|
||||
@ -1007,6 +1007,7 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
filem_request = OBJ_NEW(orte_filem_base_request_t);
|
||||
filem_request->num_procs = 1;
|
||||
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
|
||||
filem_request->proc_name[0].cellid = orte_process_info.gpr_replica->cellid;
|
||||
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
|
||||
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
|
||||
if(app_item->app_context->preload_binary) {
|
||||
@ -1105,8 +1106,8 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
* If it has been launched, then do nothing
|
||||
*/
|
||||
if (child->alive) {
|
||||
opal_output(orte_odls_globals.output, "odls: child %s is already alive",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is already alive",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1115,13 +1116,13 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
|
||||
* the dss.compare function to check for equality.
|
||||
*/
|
||||
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
|
||||
opal_output(orte_odls_globals.output, "odls: child %s is not in job %ld being launched",
|
||||
ORTE_NAME_PRINT(child->name), (long)job);
|
||||
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is not in job %ld being launched",
|
||||
ORTE_NAME_ARGS(child->name), (long)job);
|
||||
continue;
|
||||
}
|
||||
|
||||
opal_output(orte_odls_globals.output, "odls: preparing to launch child %s",
|
||||
ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]",
|
||||
ORTE_NAME_ARGS(child->name));
|
||||
|
||||
/* find the indicated app_context in the list */
|
||||
for (item2 = opal_list_get_first(&app_context_list);
|
||||
@ -1272,8 +1273,8 @@ int orte_odls_process_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, o
|
||||
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
|
||||
continue;
|
||||
}
|
||||
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child %s",
|
||||
(unsigned long)tag, ORTE_NAME_PRINT(child->name));
|
||||
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child [%ld, %ld, %ld]",
|
||||
(unsigned long)tag, ORTE_NAME_ARGS(child->name));
|
||||
|
||||
/* if so, send the message */
|
||||
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);
|
||||
|
@ -147,7 +147,7 @@ int mca_oob_xcast_nb(orte_jobid_t job,
|
||||
DONE:
|
||||
if (orte_timing) {
|
||||
gettimeofday(&stop, NULL);
|
||||
opal_output(0, "xcast_nb %s: time %ld usec", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
opal_output(0, "xcast_nb [%ld,%ld,%ld]: time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
|
||||
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
|
||||
(stop.tv_usec - start.tv_usec)));
|
||||
}
|
||||
@ -226,7 +226,7 @@ DONE:
|
||||
|
||||
if (orte_timing) {
|
||||
gettimeofday(&stop, NULL);
|
||||
opal_output(0, "xcast %s: time %ld usec", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
opal_output(0, "xcast [%ld,%ld,%ld]: time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
|
||||
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
|
||||
(stop.tv_usec - start.tv_usec)));
|
||||
}
|
||||
@ -314,11 +314,12 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (orte_timing) {
|
||||
opal_output(0, "xcast %s: mode binomial buffer size %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
|
||||
opal_output(0, "xcast [%ld,%ld,%ld]: mode binomial buffer size %ld",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
|
||||
}
|
||||
|
||||
/* start setting up the target recipients */
|
||||
target.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
target.jobid = 0;
|
||||
|
||||
/* compute the bitmap */
|
||||
@ -358,12 +359,13 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
|
||||
orte_oob_xcast_num_active += binomial_xcast_num_active;
|
||||
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
|
||||
|
||||
target.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
target.jobid = 0;
|
||||
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
|
||||
peer = rank | mask;
|
||||
if (peer < size) {
|
||||
target.vpid = (orte_vpid_t)peer;
|
||||
opal_output(mca_oob_base_output, "%s xcast to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&target));
|
||||
opal_output(mca_oob_base_output, "[%ld,%ld,%ld] xcast to [%ld,%ld,%ld]", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&target));
|
||||
if (0 > (rc = mca_oob_send_packed_nb(&target, buf, ORTE_RML_TAG_ORTED_ROUTED,
|
||||
0, mca_oob_xcast_send_cb, NULL))) {
|
||||
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
|
||||
@ -450,8 +452,8 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (orte_timing) {
|
||||
opal_output(0, "xcast %s: mode linear buffer size %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
|
||||
opal_output(0, "xcast [%ld,%ld,%ld]: mode linear buffer size %ld",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
|
||||
}
|
||||
|
||||
/* get the number of daemons out there */
|
||||
@ -486,6 +488,7 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
|
||||
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
|
||||
|
||||
/* send the message to each daemon as fast as we can */
|
||||
dummy.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
dummy.jobid = 0;
|
||||
for (i=0; i < range; i++) {
|
||||
if (ORTE_PROC_MY_NAME->vpid != i) { /* don't send to myself */
|
||||
@ -541,8 +544,8 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
|
||||
OBJ_DESTRUCT(&attrs);
|
||||
|
||||
if (orte_timing) {
|
||||
opal_output(0, "xcast %s: mode direct buffer size %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buffer->bytes_used);
|
||||
opal_output(0, "xcast [%ld,%ld,%ld]: mode direct buffer size %ld",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buffer->bytes_used);
|
||||
}
|
||||
|
||||
/* we have to account for all of the messages we are about to send
|
||||
@ -555,7 +558,7 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
|
||||
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
|
||||
|
||||
for(i=0; i<n; i++) {
|
||||
opal_output(mca_oob_base_output, "oob_xcast: sending to %s", ORTE_NAME_PRINT(peers+i));
|
||||
opal_output(mca_oob_base_output, "oob_xcast: sending to [%ld,%ld,%ld]", ORTE_NAME_ARGS(peers+i));
|
||||
if (0 > (rc = mca_oob_send_packed_nb(peers+i, buffer, tag, 0, mca_oob_xcast_send_cb, NULL))) {
|
||||
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
|
||||
|
@ -460,8 +460,8 @@ static void mca_oob_tcp_accept(int incoming_sd)
|
||||
|
||||
/* log the accept */
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s mca_oob_tcp_accept: %s:%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_accept: %s:%d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
opal_net_get_hostname((struct sockaddr*) &addr),
|
||||
opal_net_get_port((struct sockaddr*) &addr));
|
||||
}
|
||||
@ -672,8 +672,8 @@ static void* mca_oob_tcp_listen_thread(opal_object_t *obj)
|
||||
}
|
||||
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
item->fd, opal_socket_errno,
|
||||
inet_ntoa(item->addr.sin_addr),
|
||||
item->addr.sin_port);
|
||||
@ -743,8 +743,8 @@ static int mca_oob_tcp_listen_progress(void)
|
||||
|
||||
/* log the accept */
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s mca_oob_tcp_listen_progress: %s:%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_progress: %s:%d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
inet_ntoa(item->addr.sin_addr),
|
||||
item->addr.sin_port);
|
||||
}
|
||||
@ -851,9 +851,9 @@ static void mca_oob_tcp_recv_probe(int sd, mca_oob_tcp_hdr_t* hdr)
|
||||
int retval = send(sd, (char *)ptr+cnt, sizeof(mca_oob_tcp_hdr_t)-cnt, 0);
|
||||
if(retval < 0) {
|
||||
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_probe: send() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(hdr->msg_src)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_probe: send() failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(hdr->msg_src)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
@ -877,13 +877,13 @@ static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr)
|
||||
|
||||
/* now set socket up to be non-blocking */
|
||||
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: fcntl(F_GETFL) failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: fcntl(F_GETFL) failed: %s (%d)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(sd, F_SETFL, flags) < 0) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: fcntl(F_SETFL) failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: fcntl(F_SETFL) failed: %s (%d)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
||||
}
|
||||
}
|
||||
|
||||
@ -898,24 +898,25 @@ static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr)
|
||||
if (ORTE_SUCCESS != orte_ns.reserve_range(hdr->msg_src.jobid, 1, &hdr->msg_src.vpid)) {
|
||||
return;
|
||||
}
|
||||
hdr->msg_src.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
}
|
||||
|
||||
/* lookup the corresponding process */
|
||||
peer = mca_oob_tcp_peer_lookup(&hdr->msg_src);
|
||||
if(NULL == peer) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: unable to locate peer",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: unable to locate peer",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
}
|
||||
/* is the peer instance willing to accept this connection */
|
||||
if(mca_oob_tcp_peer_accept(peer, sd) == false) {
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_recv_handler: "
|
||||
"rejected connection from %s connection state %d",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_PRINT(&(hdr->msg_src)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_recv_handler: "
|
||||
"rejected connection from [%lu,%lu,%lu] connection state %d",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(&(hdr->msg_src)),
|
||||
peer->peer_state);
|
||||
}
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
@ -954,15 +955,15 @@ static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
|
||||
while((rc = recv(sd, (char *)&hdr, sizeof(hdr), 0)) != sizeof(hdr)) {
|
||||
if(rc >= 0) {
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: peer closed connection",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: peer closed connection",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
}
|
||||
if(opal_socket_errno != EINTR) {
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
return;
|
||||
}
|
||||
@ -978,8 +979,8 @@ static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
|
||||
mca_oob_tcp_recv_connect(sd, &hdr);
|
||||
break;
|
||||
default:
|
||||
opal_output(0, "%s mca_oob_tcp_recv_handler: invalid message type: %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), hdr.msg_type);
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: invalid message type: %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), hdr.msg_type);
|
||||
CLOSE_THE_SOCKET(sd);
|
||||
break;
|
||||
}
|
||||
@ -1079,8 +1080,8 @@ void mca_oob_tcp_registry_callback(
|
||||
mca_oob_tcp_peer_t* peer;
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s mca_oob_tcp_registry_callback\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
/* process the callback */
|
||||
@ -1115,15 +1116,15 @@ void mca_oob_tcp_registry_callback(
|
||||
addr = mca_oob_tcp_addr_unpack(&buffer);
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
if(NULL == addr) {
|
||||
opal_output(0, "%s mca_oob_tcp_registry_callback: unable to unpack peer address\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback: unable to unpack peer address\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
continue;
|
||||
}
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug > OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s mca_oob_tcp_registry_callback: received peer %s\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(addr->addr_name)));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback: received peer [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(addr->addr_name)));
|
||||
}
|
||||
|
||||
/* check for existing cache entry */
|
||||
@ -1132,8 +1133,8 @@ void mca_oob_tcp_registry_callback(
|
||||
if(NULL != existing && ORTE_EQUAL != orte_dss.compare(ORTE_PROC_MY_NAME, &addr->addr_name, ORTE_NAME)) {
|
||||
/* need to update existing entry - but don't update our own entry! */
|
||||
if(mca_oob_tcp_component.tcp_debug > OOB_TCP_DEBUG_INFO) {
|
||||
opal_output( 0, "%s Received OOB update for %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&addr->addr_name) );
|
||||
opal_output( 0, "[%ld,%ld,%ld] Received OOB update for [%ld,%ld,%ld]",
|
||||
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&addr->addr_name) );
|
||||
}
|
||||
orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peer_names, &addr->addr_name, addr);
|
||||
OBJ_RELEASE(addr);
|
||||
@ -1216,8 +1217,8 @@ int mca_oob_tcp_init(void)
|
||||
16); /* increment to grow by */
|
||||
opal_progress_register(mca_oob_tcp_listen_progress);
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s accepting connections via listen thread",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] accepting connections via listen thread",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
} else {
|
||||
/* fix up the listen_type, since we might have been in thread,
|
||||
@ -1245,8 +1246,8 @@ int mca_oob_tcp_init(void)
|
||||
}
|
||||
#endif
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s accepting connections via event library",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] accepting connections via event library",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1269,8 +1270,8 @@ int mca_oob_tcp_register_subscription(orte_jobid_t jobid, char *trigger)
|
||||
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
|
||||
opal_output(0, "%s mca_oob_tcp_init: calling orte_gpr.subscribe\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_init: calling orte_gpr.subscribe\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name));
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&sub_name,
|
||||
@ -1367,9 +1368,9 @@ int mca_oob_tcp_register_contact_info(void)
|
||||
tmp2 = strrchr(tmp, '/') + 1;
|
||||
tmp3 = strrchr(tmp, ':');
|
||||
if(NULL == tmp2 || NULL == tmp3) {
|
||||
opal_output(0, "%s mca_oob_tcp_init: invalid address \'%s\' "
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_init: invalid address \'%s\' "
|
||||
"returned for selected oob interfaces.\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), tmp);
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), tmp);
|
||||
ORTE_ERROR_LOG(ORTE_ERROR);
|
||||
free(tmp);
|
||||
free(bo.bytes);
|
||||
|
@ -257,9 +257,9 @@ bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
|
||||
else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK)
|
||||
return false;
|
||||
else {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_send_handler: writev failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_send_handler: writev failed: %s (%d)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -305,9 +305,9 @@ bool mca_oob_tcp_msg_recv_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
|
||||
if(msg->msg_hdr.msg_size > 0) {
|
||||
msg->msg_rwbuf = malloc(msg->msg_hdr.msg_size);
|
||||
if(NULL == msg->msg_rwbuf) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv_handler: malloc(%d) failed\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv_handler: malloc(%d) failed\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
msg->msg_hdr.msg_size);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
return false;
|
||||
@ -321,9 +321,9 @@ bool mca_oob_tcp_msg_recv_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
|
||||
msg->msg_rwnum = 0;
|
||||
}
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv_handler: size %lu\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv_handler: size %lu\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
(unsigned long)(msg->msg_hdr.msg_size) );
|
||||
}
|
||||
}
|
||||
@ -362,9 +362,9 @@ static bool mca_oob_tcp_msg_recv(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pee
|
||||
under UNIX/Linux environments */
|
||||
else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK)
|
||||
return false;
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv: readv failed: %s (%d)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -372,9 +372,9 @@ static bool mca_oob_tcp_msg_recv(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pee
|
||||
return false;
|
||||
} else if (rc == 0) {
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: peer closed connection",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv: peer closed connection",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)));
|
||||
}
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
mca_oob_call_exception_handlers(&peer->peer_name, MCA_OOB_PEER_DISCONNECTED);
|
||||
@ -417,9 +417,11 @@ void mca_oob_tcp_msg_recv_complete(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* p
|
||||
mca_oob_tcp_msg_data(msg,peer);
|
||||
break;
|
||||
default:
|
||||
opal_output(0, "%s mca_oob_tcp_msg_recv_complete: invalid message type: %d from peer %s\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name), msg->msg_hdr.msg_type,
|
||||
ORTE_NAME_PRINT(&peer->peer_name));
|
||||
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_msg_recv_complete: invalid message type: %d from peer [%lu,%lu,%lu]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name), msg->msg_hdr.msg_type,
|
||||
(long)(peer->peer_name.cellid),
|
||||
(long)(peer->peer_name.jobid),
|
||||
(long)(peer->peer_name.vpid));
|
||||
MCA_OOB_TCP_MSG_RETURN(msg);
|
||||
break;
|
||||
}
|
||||
|
@ -312,20 +312,20 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
||||
do {
|
||||
/* pick an address in round-robin fashion from the list exported by the peer */
|
||||
if(ORTE_SUCCESS != (rc = mca_oob_tcp_addr_get_next(peer->peer_addr, (struct sockaddr_storage*) &inaddr))) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
|
||||
"mca_oob_tcp_addr_get_next failed with error=%d",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
rc);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
|
||||
"connecting port %d to: %s:%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
/* Bug, FIXME: output tcp6_listen_port for AF_INET6 */
|
||||
ntohs(mca_oob_tcp_component.tcp_listen_port),
|
||||
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
||||
@ -380,10 +380,10 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
|
||||
"connect to %s:%d failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
||||
opal_net_get_port((struct sockaddr*) &inaddr),
|
||||
strerror(opal_socket_errno),
|
||||
@ -408,10 +408,10 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
||||
return ORTE_SUCCESS;
|
||||
} else {
|
||||
opal_output(0,
|
||||
"%s-%s mca_oob_tcp_peer_try_connect: "
|
||||
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
|
||||
"mca_oob_tcp_peer_send_connect_ack to %s:%d failed: %s (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
||||
opal_net_get_port((struct sockaddr*) &inaddr),
|
||||
opal_strerror(rc),
|
||||
@ -420,10 +420,10 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
|
||||
} while(peer->peer_addr->addr_next != 0);
|
||||
|
||||
/* None of the interfaces worked.. */
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
|
||||
"connect to %s:%d failed, connecting over all interfaces failed!",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
opal_net_get_hostname((struct sockaddr*) &inaddr),
|
||||
opal_net_get_port((struct sockaddr*) &inaddr));
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -455,9 +455,9 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
|
||||
#endif
|
||||
struct timeval tv = { 1,0 };
|
||||
opal_output(0,
|
||||
"%s-%s mca_oob_tcp_peer_start_connect: socket() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: socket() failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
@ -481,17 +481,17 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
|
||||
/* setup the socket as non-blocking */
|
||||
if (peer->peer_sd >= 0) {
|
||||
if((flags = fcntl(peer->peer_sd, F_GETFL, 0)) < 0) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(peer->peer_sd, F_SETFL, flags) < 0)
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
}
|
||||
@ -499,16 +499,16 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
|
||||
#if OPAL_WANT_IPV6
|
||||
if (peer->peer6_sd >= 0) {
|
||||
if((flags = fcntl(peer->peer6_sd, F_GETFL, 0)) < 0) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed with errno=%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed with errno=%d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
opal_socket_errno);
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(peer->peer6_sd, F_SETFL, flags) < 0)
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed with errno=%d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed with errno=%d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
opal_socket_errno);
|
||||
}
|
||||
}
|
||||
@ -548,9 +548,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
|
||||
|
||||
/* check connect completion status */
|
||||
if(getsockopt(sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: getsockopt() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: getsockopt() failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -573,17 +573,17 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
|
||||
return;
|
||||
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
|
||||
struct timeval tv = { 1,0 };
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
|
||||
"connection failed: %s (%d) - retrying\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(so_error),
|
||||
so_error);
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
|
||||
"sending ack, %d",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)), so_error);
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)), so_error);
|
||||
}
|
||||
|
||||
mca_oob_tcp_peer_shutdown(peer);
|
||||
@ -612,9 +612,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
|
||||
opal_event_add(&peer->peer_recv_event, 0);
|
||||
#endif
|
||||
} else {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: unable to send connect ack.",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: unable to send connect ack.",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)));
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
}
|
||||
}
|
||||
@ -673,9 +673,9 @@ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd)
|
||||
void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
{
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_close(%p) sd %d state %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_close(%p) sd %d state %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
(void*)peer,
|
||||
peer->peer_sd,
|
||||
peer->peer_state);
|
||||
@ -683,9 +683,9 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
|
||||
|
||||
#if OPAL_WANT_IPV6
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_close(%p) sd6 %d state %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_close(%p) sd6 %d state %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
(void*)peer,
|
||||
peer->peer6_sd,
|
||||
peer->peer_state);
|
||||
@ -713,9 +713,9 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
||||
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
||||
mca_oob_tcp_msg_t *msg;
|
||||
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_shutdown: retries exceeded",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_shutdown: retries exceeded",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)));
|
||||
|
||||
/* There are cases during the initial connection setup where
|
||||
the peer_send_msg is NULL but there are things in the queue
|
||||
@ -802,10 +802,10 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd)
|
||||
struct timeval tv = { 1,0 };
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
|
||||
opal_output(0,
|
||||
"%s-%s mca_oob_tcp_peer_recv_connect_ack "
|
||||
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack "
|
||||
"connect failed during receive. Restarting (%s).",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno));
|
||||
}
|
||||
opal_event_del(&peer->peer_recv_event);
|
||||
@ -830,11 +830,11 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd)
|
||||
|
||||
/* compare the peers name to the expected value */
|
||||
if (0 != orte_ns.compare_fields(ORTE_NS_CMP_ALL, &peer->peer_name, &hdr.msg_src)) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_connect_ack: "
|
||||
"received unexpected process identifier %s\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_PRINT(&(hdr.msg_src)));
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack: "
|
||||
"received unexpected process identifier [%ld,%ld,%ld]\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(&(hdr.msg_src)));
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
return ORTE_ERR_UNREACH;
|
||||
}
|
||||
@ -844,7 +844,8 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd)
|
||||
* comparison - we do NOT want wildcard values to return EQUAL
|
||||
*/
|
||||
if(orte_process_info.my_name == NULL) {
|
||||
orte_ns.create_process_name(&orte_process_info.my_name, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
|
||||
orte_ns.create_process_name(&orte_process_info.my_name,
|
||||
hdr.msg_dst.cellid, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
|
||||
} else if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, orte_process_info.my_name, ORTE_NAME_INVALID) == ORTE_EQUAL) {
|
||||
*orte_process_info.my_name = hdr.msg_dst;
|
||||
}
|
||||
@ -872,10 +873,10 @@ static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, int sd, void
|
||||
/* remote closed connection */
|
||||
if(retval == 0) {
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_blocking: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_blocking: "
|
||||
"peer closed connection: peer state %d",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
peer->peer_state);
|
||||
}
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -905,10 +906,10 @@ static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, int sd, void
|
||||
return -1;
|
||||
} else {
|
||||
opal_output(0,
|
||||
"%s-%s mca_oob_tcp_peer_recv_blocking: "
|
||||
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_blocking: "
|
||||
"recv() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(errno),
|
||||
errno);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -934,9 +935,9 @@ static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, int sd, void
|
||||
int retval = send(sd, (char *)ptr+cnt, size-cnt, 0);
|
||||
if(retval < 0) {
|
||||
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_send_blocking: send() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_send_blocking: send() failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
@ -1010,9 +1011,9 @@ static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
|
||||
mca_oob_tcp_msg_t* msg;
|
||||
MCA_OOB_TCP_MSG_ALLOC(msg, rc);
|
||||
if(NULL == msg) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)));
|
||||
return;
|
||||
}
|
||||
msg->msg_type = MCA_OOB_TCP_UNEXPECTED;
|
||||
@ -1040,9 +1041,9 @@ static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
|
||||
}
|
||||
default:
|
||||
{
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_handler: invalid socket state(%d)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
peer->peer_state);
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
break;
|
||||
@ -1087,9 +1088,9 @@ static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user)
|
||||
break;
|
||||
}
|
||||
default:
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: invalid connection state (%d)",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_send_handler: invalid connection state (%d)",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
peer->peer_state);
|
||||
opal_event_del(&peer->peer_send_event);
|
||||
break;
|
||||
@ -1158,9 +1159,9 @@ static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
|
||||
nodelay = 0;
|
||||
#endif
|
||||
|
||||
sprintf(buff, "%s-%s %s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
sprintf(buff, "[%lu,%lu,%lu]-[%lu,%lu,%lu] %s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
msg, src, dst, nodelay, sndbuf, rcvbuf, flags);
|
||||
opal_output(0, buff);
|
||||
}
|
||||
@ -1193,10 +1194,10 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd)
|
||||
mca_oob_tcp_peer_event_init(peer);
|
||||
|
||||
if(mca_oob_tcp_peer_send_connect_ack(peer, sd) != ORTE_SUCCESS) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_peer_accept: "
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_accept: "
|
||||
"mca_oob_tcp_peer_send_connect_ack failed\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)));
|
||||
mca_oob_tcp_peer_close(peer);
|
||||
OPAL_THREAD_UNLOCK(&peer->peer_lock);
|
||||
return false;
|
||||
|
@ -91,9 +91,9 @@ int mca_oob_tcp_ping(
|
||||
/* parse uri string */
|
||||
if(ORTE_SUCCESS != (rc = mca_oob_tcp_parse_uri(uri, &inaddr))) {
|
||||
opal_output(0,
|
||||
"%s-%s mca_oob_tcp_ping: invalid uri: %s\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)name),
|
||||
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: invalid uri: %s\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(name),
|
||||
uri);
|
||||
return rc;
|
||||
}
|
||||
@ -106,9 +106,9 @@ int mca_oob_tcp_ping(
|
||||
#endif
|
||||
if (sd < 0) {
|
||||
opal_output(0,
|
||||
"%s-%s mca_oob_tcp_ping: socket() failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)name),
|
||||
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: socket() failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(name),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
return ORTE_ERR_UNREACH;
|
||||
@ -116,17 +116,17 @@ int mca_oob_tcp_ping(
|
||||
|
||||
/* setup the socket as non-blocking */
|
||||
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_ping: fcntl(F_GETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)name),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: fcntl(F_GETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(name),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
} else {
|
||||
flags |= O_NONBLOCK;
|
||||
if(fcntl(sd, F_SETFL, flags) < 0) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)name),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(name),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
}
|
||||
@ -154,9 +154,9 @@ int mca_oob_tcp_ping(
|
||||
/* set socket back to blocking */
|
||||
flags &= ~O_NONBLOCK;
|
||||
if(fcntl(sd, F_SETFL, flags) < 0) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT((orte_process_name_t*)name),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(name),
|
||||
strerror(opal_socket_errno),
|
||||
opal_socket_errno);
|
||||
}
|
||||
|
@ -46,9 +46,9 @@ int mca_oob_tcp_recv(
|
||||
int i, rc = 0, size = 0;
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_recv: tag %d\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(peer),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%ld,%ld,%ld] mca_oob_tcp_recv: tag %d\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(peer),
|
||||
tag);
|
||||
}
|
||||
|
||||
@ -65,9 +65,9 @@ int mca_oob_tcp_recv(
|
||||
}
|
||||
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_recv*unexpected*: tag %d size %lu\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(peer),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%ld,%ld,%ld] mca_oob_tcp_recv*unexpected*: tag %d size %lu\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(peer),
|
||||
tag, (unsigned long)(msg->msg_hdr.msg_size) );
|
||||
}
|
||||
/* if we are returning an allocated buffer - just take it from the message */
|
||||
@ -117,9 +117,9 @@ int mca_oob_tcp_recv(
|
||||
}
|
||||
|
||||
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_recv*expected*: tag %d size %lu\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(peer),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%ld,%ld,%ld] mca_oob_tcp_recv*expected*: tag %d size %lu\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(peer),
|
||||
tag, (unsigned long)(size) );
|
||||
}
|
||||
|
||||
|
@ -107,9 +107,9 @@ int mca_oob_tcp_send(
|
||||
}
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_send: tag %d size %lu\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_send: tag %d size %lu\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
tag, (unsigned long)size );
|
||||
}
|
||||
|
||||
@ -214,9 +214,9 @@ int mca_oob_tcp_send_nb(
|
||||
}
|
||||
|
||||
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
|
||||
opal_output(0, "%s-%s mca_oob_tcp_send_nb: tag %d size %lu\n",
|
||||
ORTE_NAME_PRINT(orte_process_info.my_name),
|
||||
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_send_nb: tag %d size %lu\n",
|
||||
ORTE_NAME_ARGS(orte_process_info.my_name),
|
||||
ORTE_NAME_ARGS(&(peer->peer_name)),
|
||||
tag, (unsigned long)size );
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,376 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/orte_constants.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/rmgr/rmgr.h"
|
||||
|
||||
#include "orte/mca/pls/base/pls_private.h"
|
||||
|
||||
/*
 * Constructor: put a new daemon-info record into its "empty" state --
 * no process name, no node name, invalid cell and invalid job.
 */
static void orte_pls_daemon_info_construct(orte_pls_daemon_info_t* ptr)
{
    ptr->name = NULL;
    ptr->nodename = NULL;
    ptr->active_job = ORTE_JOBID_INVALID;
    ptr->cell = ORTE_CELLID_INVALID;
}

/*
 * Destructor: free the process name and the node name if they were set.
 */
static void orte_pls_daemon_info_destructor(orte_pls_daemon_info_t* ptr)
{
    if (NULL != ptr->name) {
        free(ptr->name);
    }
    if (NULL != ptr->nodename) {
        free(ptr->nodename);
    }
}

/* Class instance: a list item with the constructor/destructor above */
OBJ_CLASS_INSTANCE(orte_pls_daemon_info_t,            /* type name */
                   opal_list_item_t,                  /* parent "class" name */
                   orte_pls_daemon_info_construct,    /* constructor */
                   orte_pls_daemon_info_destructor);  /* destructor */
|
||||
|
||||
/*
|
||||
* Store the active daemons for a job
|
||||
*/
|
||||
int orte_pls_base_store_active_daemons(opal_list_t *daemons)
|
||||
{
|
||||
orte_pls_daemon_info_t *dmn;
|
||||
opal_list_item_t *item;
|
||||
orte_gpr_value_t **values;
|
||||
char *jobid_string, *key;
|
||||
int rc, i, num_daemons;
|
||||
|
||||
/* determine the number of daemons */
|
||||
num_daemons = (int)opal_list_get_size(daemons);
|
||||
|
||||
if (0 == num_daemons) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* since each daemon gets recorded in a separate node's container,
|
||||
* we need to allocate space for num_daemons value objects
|
||||
*/
|
||||
values = (orte_gpr_value_t**)malloc(num_daemons * sizeof(orte_gpr_value_t*));
|
||||
if (NULL == values) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
memset(values, 0, num_daemons*sizeof(orte_gpr_value_t*)); /* NULL the array */
|
||||
|
||||
/* loop through the values and the list and create all the value objects */
|
||||
item = opal_list_get_first(daemons);
|
||||
for (i=0; i < num_daemons; i++) {
|
||||
dmn = (orte_pls_daemon_info_t*)item;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&values[i],
|
||||
ORTE_GPR_OVERWRITE,
|
||||
ORTE_NODE_SEGMENT,
|
||||
1, 0))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens),
|
||||
dmn->cell, dmn->nodename))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* setup the key */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, dmn->active_job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(values[0]);
|
||||
return rc;
|
||||
}
|
||||
asprintf(&key, "%s-%s", ORTE_NODE_BOOTPROXY_KEY, jobid_string);
|
||||
free(jobid_string);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), key, ORTE_NAME, dmn->name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
free(key);
|
||||
item = opal_list_get_next(item);
|
||||
}
|
||||
|
||||
rc = orte_gpr.put(num_daemons, values);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
for (i=0; i < num_daemons; i++) {
|
||||
if (NULL != values[i]) OBJ_RELEASE(values[i]);
|
||||
}
|
||||
if (NULL != values) free(values);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Append to "daemons" the daemons recorded in the registry for one job.
 *
 * Queries every container on the node segment for the keys
 * "<ORTE_NODE_BOOTPROXY_KEY>-<jobid>", ORTE_NODE_NAME_KEY and
 * ORTE_CELLID_KEY. A container yields a list entry only if all three
 * were found, the daemon name is not ORTE_PROC_MY_NAME, and (when the
 * list was non-empty on entry) no entry with the same name is already
 * on the list.
 *
 * @param daemons  list to append orte_pls_daemon_info_t items to
 * @param job      jobid whose bootproxy key is looked up
 *
 * @retval ORTE_SUCCESS  query and list construction completed
 * @retval error         code of the first step that failed
 */
static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
{
    orte_gpr_value_t **values;
    orte_gpr_keyval_t *kv;
    orte_std_cntr_t cnt, i, j;
    char* jobid_string;
    char *keys[] = {
        NULL, /* placeholder */
        ORTE_NODE_NAME_KEY,
        ORTE_CELLID_KEY,
        NULL
    };
    /* only read after the matching found_* flag has been set; initialized
     * anyway so that no path can see an indeterminate pointer
     */
    orte_cellid_t *cell = NULL;
    char *nodename = NULL;
    orte_process_name_t *name = NULL;
    orte_pls_daemon_info_t *dmn, *dmn2;
    bool found_name, found_node, found_cell;
    opal_list_item_t *item;
    bool check_dups;
    int rc;

    /* check the list to see if there is anything already on it. If there is, then
     * we will need to check for duplicate entries before we add something. If not,
     * then this can go a lot faster
     */
    check_dups = (0 < opal_list_get_size(daemons));

    /* setup the key */
    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (0 > asprintf(&keys[0], "%s-%s", ORTE_NODE_BOOTPROXY_KEY, jobid_string)) {
        /* keys[0] is undefined when asprintf fails - bail out before it
         * can be used in the query, in strcmp, or passed to free
         */
        free(jobid_string);
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    free(jobid_string);

    /* query the daemon info */
    if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                           ORTE_NODE_SEGMENT,
                                           NULL, /* all containers */
                                           keys,
                                           &cnt, &values))) {
        ORTE_ERROR_LOG(rc);
        free(keys[0]);
        return rc;
    }

    /* loop through the answers and construct the list */
    for (i=0; i < cnt; i++) {
        /* for systems such as bproc, the node segment holds containers
         * for nodes that we may not have launched upon. Each container
         * will send us back a value object, so we have to ensure here
         * that we only create daemon objects on the list for those nodes
         * that DO provide a valid object
         */
        found_name = found_node = found_cell = false;
        for (j=0; j < values[i]->cnt; j++) {
            kv = values[i]->keyvals[j];
            if (0 == strcmp(kv->key, keys[0])) {
                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&name, kv->value, ORTE_NAME))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                found_name = true;
                continue;
            }
            if (0 == strcmp(kv->key, ORTE_NODE_NAME_KEY)) {
                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nodename, kv->value, ORTE_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                found_node = true;
                continue;
            }
            if (0 == strcmp(kv->key, ORTE_CELLID_KEY)) {
                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cell, kv->value, ORTE_CELLID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                found_cell = true;
                continue;
            }
        }
        /* if we found everything, then this is a valid entry */
        if (found_name && found_node && found_cell) {
            /* first check if this name is ourself - if so, ignore it */
            if (ORTE_EQUAL == orte_dss.compare(name, ORTE_PROC_MY_NAME, ORTE_NAME)) {
                goto MOVEON;
            }

            if (check_dups) {
                /* see if this daemon is already on the list - if so, then we don't add it */
                for (item = opal_list_get_first(daemons);
                     item != opal_list_get_end(daemons);
                     item = opal_list_get_next(item)) {
                    dmn2 = (orte_pls_daemon_info_t*)item;

                    if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
                        /* already on list - ignore it */
                        goto MOVEON;
                    }
                }
            }
            dmn = OBJ_NEW(orte_pls_daemon_info_t);
            if (NULL == dmn) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto CLEANUP;
            }
            if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(dmn);
                goto CLEANUP;
            }
            dmn->cell = *cell;
            if (NULL != nodename) {
                dmn->nodename = strdup(nodename);
                if (NULL == dmn->nodename) {
                    /* do not put a half-built entry on the list */
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(dmn);
                    goto CLEANUP;
                }
            }

            /* add this daemon to the list */
            opal_list_append(daemons, &dmn->super);
        }
MOVEON:
        OBJ_RELEASE(values[i]);
    }

CLEANUP:
    /* release whatever value objects the loop above did not get to */
    for (i=0; i < cnt; i++) {
        if (NULL != values[i]) OBJ_RELEASE(values[i]);
    }
    if (NULL != values) free(values);
    free(keys[0]);

    return rc;
}
|
||||
|
||||
/*
|
||||
* Retrieve a list of the active daemons for a job
|
||||
*/
|
||||
int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs)
|
||||
{
|
||||
orte_jobid_t *jobs;
|
||||
orte_std_cntr_t njobs, i;
|
||||
bool allocated;
|
||||
int rc;
|
||||
|
||||
if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_DESCENDANTS)) {
|
||||
/* need to include all descendants in list */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&jobs, &njobs, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
allocated = true;
|
||||
} else if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_CHILDREN)) {
|
||||
/* just include the direct children of the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_job_children(&jobs, &njobs, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
allocated = true;
|
||||
} else {
|
||||
/* just want daemons for this one job */
|
||||
jobs = &job;
|
||||
njobs = 1;
|
||||
allocated = false;
|
||||
}
|
||||
|
||||
/* loop through all the jobs and get their info */
|
||||
for (i=0; i < njobs; i++) {
|
||||
if (ORTE_SUCCESS != (rc = get_daemons(daemons, jobs[i]))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
if (allocated) free(jobs);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove a daemon from the world of active daemons
|
||||
*/
|
||||
int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
|
||||
{
|
||||
/* We need to do a registry
|
||||
* delete function call targeting the entry
|
||||
*/
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Check for available daemons we can re-use
|
||||
*/
|
||||
int orte_pls_base_check_avail_daemons(opal_list_t *daemons,
|
||||
orte_jobid_t job)
|
||||
{
|
||||
orte_jobid_t root, *descendants;
|
||||
orte_std_cntr_t i, ndesc;
|
||||
int rc;
|
||||
|
||||
/* check for daemons belonging to any job in this job's family.
|
||||
* Since the jobs in any family must exit together, it is reasonable
|
||||
* for us to reuse any daemons that were spawned by any member
|
||||
* of our extended family. We can find all of our family members
|
||||
* by first finding our root job, and then getting all of its
|
||||
* descendants
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_root_job(&root, job))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&descendants, &ndesc, root))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* loop through the descendants, adding to the daemon list as we go */
|
||||
for (i=0; i < ndesc; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, descendants[i], NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(descendants);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
free(descendants); /* all done with these */
|
||||
|
||||
/* now add in any persistent daemons - they are tagged as bootproxies
|
||||
* for jobid = 0
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_get_active_daemons(daemons, 0, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -423,6 +423,16 @@ static void orte_pls_bproc_setup_env(char *** env)
|
||||
|
||||
/**
|
||||
* Launches the daemons
|
||||
* @param cellid the cellid of the job
|
||||
* @param envp a pointer to the environment to use for the daemons
|
||||
* @param node_arrays an array that holds the node arrays for each app context
|
||||
* @param node_array_lens an array of lengths of the node arrays
|
||||
* @param num_contexts the number of application contexts
|
||||
* @param num_procs the number of processes in the job
|
||||
* @param global_vpid_start the starting vpid for the user's processes
|
||||
* @param jobid the jobid for the user processes
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
* you encounter an error so that orterun will be woken up and
|
||||
@ -519,7 +529,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
free(var);
|
||||
|
||||
/* set up the base environment so the daemons can get their names once launched */
|
||||
rc = orte_ns_nds_bproc_put(0, map->daemon_vpid_start,
|
||||
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, map->daemon_vpid_start,
|
||||
0, num_daemons, ORTE_VPID_INVALID, 1, envp);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -626,7 +636,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
goto cleanup;
|
||||
}
|
||||
rc = orte_pls_bproc_set_node_pid(param, map->job, pids[i]);
|
||||
rc = orte_pls_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -764,6 +774,17 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
|
||||
|
||||
/**
|
||||
* Launches the application processes
|
||||
* @param cellid the cellid of the job
|
||||
* @param jobid the jobid of the job
|
||||
* @param map a pointer to the mapping of this application
|
||||
* @param num_processes the number of processes in this job
|
||||
* @param vpid_start the starting vpid for this app context
|
||||
* @param global_vpid_start the starting vpid for the user's processes
|
||||
* @param app_context the application context number
|
||||
* @param node_array the node array for this context
|
||||
* @param node_array_len the length of the node array
|
||||
* @retval ORTE_SUCCESS
|
||||
* @retval error
|
||||
*/
|
||||
|
||||
/* When working in this function, ALWAYS jump to "cleanup" if
|
||||
@ -887,7 +908,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
}
|
||||
|
||||
/* setup environment so the procs can figure out their names */
|
||||
rc = orte_ns_nds_bproc_put(map->job, vpid_start, map->vpid_start,
|
||||
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, map->job, vpid_start, map->vpid_start,
|
||||
num_processes, i, num_cycles, &env);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -943,7 +964,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
} else {
|
||||
rc = orte_ns.create_process_name(&proc_name, map->job,
|
||||
rc = orte_ns.create_process_name(&proc_name, ORTE_PROC_MY_NAME->cellid, map->job,
|
||||
vpid_start + j*stride);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -93,7 +93,7 @@ ORTE_DECLSPEC int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t** pids,
|
||||
/**
|
||||
* Utility routine to get/set daemon pid
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_pls_bproc_set_node_pid(char* node_name, orte_jobid_t jobid, pid_t pid);
|
||||
ORTE_DECLSPEC int orte_pls_bproc_set_node_pid(orte_cellid_t cellid, char* node_name, orte_jobid_t jobid, pid_t pid);
|
||||
ORTE_DECLSPEC int orte_pls_bproc_get_node_pids(orte_jobid_t jobid, pid_t** pids, orte_std_cntr_t* num_pids);
|
||||
|
||||
/* utility functions for abort communications */
|
||||
|
@ -188,7 +188,7 @@ cleanup:
|
||||
* the daemons.
|
||||
*/
|
||||
|
||||
int orte_pls_bproc_set_node_pid(char* node_name, orte_jobid_t jobid, pid_t pid)
|
||||
int orte_pls_bproc_set_node_pid(orte_cellid_t cellid, char* node_name, orte_jobid_t jobid, pid_t pid)
|
||||
{
|
||||
orte_gpr_value_t *values[1];
|
||||
char *jobid_string, *key;
|
||||
@ -202,7 +202,7 @@ int orte_pls_bproc_set_node_pid(char* node_name, orte_jobid_t jobid, pid_t pid)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(values[0]->tokens), &(values[0]->num_tokens), node_name))) {
|
||||
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(values[0]->tokens), &(values[0]->num_tokens), cellid, node_name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(values[0]);
|
||||
return rc;
|
||||
|
@ -220,6 +220,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
* their own name on the other end
|
||||
*/
|
||||
name.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
name.jobid = 0;
|
||||
name.vpid = map->daemon_vpid_start;
|
||||
rc = orte_ns.get_proc_name_string(&name_string, &name);
|
||||
|
@ -214,7 +214,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
fprintf(hfp,"%s\n",node->node_name);
|
||||
|
||||
/* initialize daemons process name */
|
||||
rc = orte_ns.create_process_name(&name, 0, vpid);
|
||||
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
|
@ -256,6 +256,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
|
||||
/* tell the new daemons the base of the name list so they can compute
|
||||
* their own name on the other end
|
||||
*/
|
||||
name.cellid = ORTE_PROC_MY_NAME->cellid;
|
||||
name.jobid = 0;
|
||||
name.vpid = map->daemon_vpid_start;
|
||||
rc = orte_ns.get_proc_name_string(&name_string, &name);
|
||||
|
@ -240,6 +240,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
|
||||
{
|
||||
int i, fanout, rc;
|
||||
int num_processes = 0;
|
||||
orte_cellid_t cellid;
|
||||
opal_list_item_t *node_item, *proc_item;
|
||||
orte_job_map_t *map;
|
||||
orte_vpid_t vpid_start, vpid_range;
|
||||
@ -260,6 +261,9 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* get the cellid */
|
||||
cellid = orte_process_info.my_name->cellid;
|
||||
|
||||
/* create num_apps of pointers to Xpnodeset and Xpcommand */
|
||||
node_sets = (Xpnodeset **) malloc(num_apps * sizeof(Xpnodeset *));
|
||||
xcmd_sets = (Xpcommand **) malloc(num_apps * sizeof(Xpcommand *));
|
||||
@ -294,7 +298,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
|
||||
}
|
||||
|
||||
for (i = 0; i < num_apps; i++) {
|
||||
rc = orte_ns_nds_xcpu_put(jobid, vpid_start,
|
||||
rc = orte_ns_nds_xcpu_put(cellid, jobid, vpid_start,
|
||||
num_processes, &map->apps[i]->env);
|
||||
if (rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -33,7 +33,10 @@ int orte_ras_base_compare_node(orte_ras_node_t *value1, orte_ras_node_t *value2,
|
||||
{
|
||||
int test;
|
||||
|
||||
/** check node names */
|
||||
if (value1->node_cellid > value2->node_cellid) return ORTE_VALUE1_GREATER;
|
||||
if (value2->node_cellid > value1->node_cellid) return ORTE_VALUE2_GREATER;
|
||||
|
||||
/** same cell - check node names */
|
||||
test = strcmp(value1->node_name, value2->node_name);
|
||||
if (0 == test) return ORTE_EQUAL;
|
||||
if (0 < test) return ORTE_VALUE2_GREATER;
|
||||
|
@ -47,6 +47,7 @@ int orte_ras_base_copy_node(orte_ras_node_t **dest, orte_ras_node_t *src, orte_d
|
||||
if (NULL != src->node_name) (*dest)->node_name = strdup(src->node_name);
|
||||
(*dest)->launch_id = src->launch_id;
|
||||
if (NULL != src->node_arch) (*dest)->node_arch = strdup(src->node_arch);
|
||||
(*dest)->node_cellid = src->node_cellid;
|
||||
(*dest)->node_state = src->node_state;
|
||||
(*dest)->node_slots = src->node_slots;
|
||||
(*dest)->node_slots_inuse = src->node_slots_inuse;
|
||||
|
@ -65,6 +65,13 @@ int orte_ras_base_pack_node(orte_buffer_t *buffer, const void *src,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the cellid */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer,
|
||||
(void*)(&(nodes[i]->node_cellid)), 1, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the state */
|
||||
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer,
|
||||
(void*)(&(nodes[i]->node_state)), 1, ORTE_NODE_STATE))) {
|
||||
|
@ -48,8 +48,8 @@ int orte_ras_base_print_node(char **output, char *prefix, orte_ras_node_t *src,
|
||||
asprintf(&pfx2, "%s", prefix);
|
||||
}
|
||||
|
||||
asprintf(&tmp, "%sData for node: Name: %s\tLaunch id: %ld",
|
||||
pfx2, src->node_name, (long)src->launch_id);
|
||||
asprintf(&tmp, "%sData for node: cellid: %lu\tName: %s\tLaunch id: %ld",
|
||||
pfx2, (unsigned long)src->node_cellid, src->node_name, (long)src->launch_id);
|
||||
|
||||
asprintf(&tmp2, "%s\n%s\tArch: %s\tState: %lu", tmp, pfx2,
|
||||
src->node_arch, (unsigned long)src->node_state);
|
||||
|
@ -77,6 +77,14 @@ int orte_ras_base_unpack_node(orte_buffer_t *buffer, void *dest,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the cellid */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
|
||||
(&(nodes[i]->node_cellid)), &n, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the state */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
|
||||
|
@ -51,7 +51,7 @@ int orte_ras_base_proc_query_alloc_no_op(opal_list_t* list)
|
||||
return ORTE_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
orte_ras_node_t* orte_ras_base_node_lookup_no_op(const char* nodename)
|
||||
orte_ras_node_t* orte_ras_base_node_lookup_no_op(orte_cellid_t cell, const char* nodename)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
@ -37,6 +37,7 @@ static void orte_ras_base_node_construct(orte_ras_node_t* node)
|
||||
node->node_name = NULL;
|
||||
node->launch_id = -1;
|
||||
node->node_arch = NULL;
|
||||
node->node_cellid = 0;
|
||||
node->node_state = ORTE_NODE_STATE_UNKNOWN;
|
||||
node->node_slots = 0;
|
||||
node->node_slots_inuse = 0;
|
||||
@ -107,10 +108,12 @@ int orte_ras_base_node_query(opal_list_t* nodes)
|
||||
ORTE_NODE_SLOTS_ALLOC_KEY,
|
||||
ORTE_NODE_SLOTS_MAX_KEY,
|
||||
ORTE_NODE_USERNAME_KEY,
|
||||
ORTE_CELLID_KEY,
|
||||
NULL
|
||||
};
|
||||
orte_std_cntr_t i, cnt, *sptr;
|
||||
orte_node_state_t *nsptr;
|
||||
orte_cellid_t *cptr;
|
||||
int32_t *i32;
|
||||
orte_gpr_value_t** values;
|
||||
int rc;
|
||||
@ -214,6 +217,14 @@ int orte_ras_base_node_query(opal_list_t* nodes)
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyval->value, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
}
|
||||
node->node_cellid = *cptr;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
opal_list_append(nodes, &node->super);
|
||||
OBJ_RELEASE(value);
|
||||
@ -313,6 +324,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
|
||||
ORTE_NODE_SLOTS_ALLOC_KEY,
|
||||
ORTE_NODE_SLOTS_MAX_KEY,
|
||||
ORTE_NODE_USERNAME_KEY,
|
||||
ORTE_CELLID_KEY,
|
||||
NULL
|
||||
};
|
||||
orte_std_cntr_t i, cnt, keys_len;
|
||||
@ -320,6 +332,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
|
||||
char* jobid_str;
|
||||
orte_std_cntr_t *sptr;
|
||||
orte_node_state_t *nsptr;
|
||||
orte_cellid_t *cptr;
|
||||
int32_t *i32;
|
||||
int rc, alloc_key_posn=5;
|
||||
|
||||
@ -446,7 +459,15 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyval->value, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
}
|
||||
node->node_cellid = *cptr;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
/* check to see if any slots were reserved on this node for us
|
||||
* The "get" command will return data from ALL nodes on the node
|
||||
* segment. We ONLY want to include here nodes that are assigned
|
||||
@ -473,7 +494,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
|
||||
* Query the registry for a specific node
|
||||
*/
|
||||
|
||||
orte_ras_node_t* orte_ras_base_node_lookup(const char* node_name)
|
||||
orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t cellid, const char* node_name)
|
||||
{
|
||||
char* keys[] = {
|
||||
ORTE_NODE_NAME_KEY,
|
||||
@ -485,18 +506,20 @@ orte_ras_node_t* orte_ras_base_node_lookup(const char* node_name)
|
||||
ORTE_NODE_SLOTS_ALLOC_KEY,
|
||||
ORTE_NODE_SLOTS_MAX_KEY,
|
||||
ORTE_NODE_USERNAME_KEY,
|
||||
ORTE_CELLID_KEY,
|
||||
NULL
|
||||
};
|
||||
orte_ras_node_t* node = NULL;
|
||||
orte_std_cntr_t i, cnt, num_tokens;
|
||||
orte_std_cntr_t *sptr;
|
||||
orte_cellid_t *cptr;
|
||||
orte_node_state_t *nsptr;
|
||||
int32_t *i32;
|
||||
orte_gpr_value_t** values;
|
||||
char** tokens = NULL;
|
||||
int rc;
|
||||
|
||||
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, (char*)node_name);
|
||||
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, cellid, (char*)node_name);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return NULL;
|
||||
@ -601,6 +624,14 @@ orte_ras_node_t* orte_ras_base_node_lookup(const char* node_name)
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) {
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyval->value, ORTE_CELLID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
continue;
|
||||
}
|
||||
node->node_cellid = *cptr;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(values[i]);
|
||||
break;
|
||||
@ -627,6 +658,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
|
||||
ORTE_NODE_LAUNCH_ID_KEY,
|
||||
ORTE_NODE_ARCH_KEY,
|
||||
ORTE_NODE_STATE_KEY,
|
||||
ORTE_CELLID_KEY,
|
||||
ORTE_NODE_SLOTS_KEY,
|
||||
ORTE_NODE_SLOTS_IN_USE_KEY,
|
||||
ORTE_NODE_SLOTS_MAX_KEY,
|
||||
@ -637,6 +669,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
|
||||
ORTE_INT32,
|
||||
ORTE_STRING,
|
||||
ORTE_NODE_STATE,
|
||||
ORTE_CELLID,
|
||||
ORTE_STD_CNTR,
|
||||
ORTE_STD_CNTR,
|
||||
ORTE_STD_CNTR,
|
||||
@ -666,7 +699,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
|
||||
for (i=0; i < num_values; i++) {
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
|
||||
ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
|
||||
ORTE_NODE_SEGMENT, 8, 0))) {
|
||||
ORTE_NODE_SEGMENT, 9, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
for (j=0; j < i; j++) {
|
||||
OBJ_RELEASE(values[j]);
|
||||
@ -705,6 +738,12 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
++j;
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->node_cellid)))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
++j;
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->node_slots)))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -730,7 +769,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
|
||||
}
|
||||
|
||||
/* setup index/keys for this node */
|
||||
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_name);
|
||||
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -752,11 +791,11 @@ cleanup:
|
||||
|
||||
|
||||
|
||||
int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid)
|
||||
int orte_ras_base_proc_insert(opal_list_t* procs, orte_cellid_t cellid, orte_jobid_t jobid)
|
||||
{
|
||||
opal_list_item_t* item;
|
||||
orte_gpr_value_t **values;
|
||||
orte_process_name_t proc_name;
|
||||
orte_process_name_t *proc_name;
|
||||
int rc;
|
||||
orte_std_cntr_t num_values, i, j;
|
||||
char *keys[] = {
|
||||
@ -803,8 +842,6 @@ int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid)
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
proc_name.jobid = jobid;
|
||||
for(i=0, item = opal_list_get_first(procs);
|
||||
i < num_values && item != opal_list_get_end(procs);
|
||||
i++, item = opal_list_get_next(item)) {
|
||||
@ -829,10 +866,14 @@ int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid)
|
||||
}
|
||||
|
||||
++j;
|
||||
rc = orte_ns.create_process_name(&proc_name, cellid, jobid, i);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* setup index/keys for this node */
|
||||
proc_name.vpid = (orte_vpid_t)i;
|
||||
rc = orte_schema.get_proc_tokens(&(values[i]->tokens), &(values[i]->num_tokens), &proc_name);
|
||||
rc = orte_schema.get_proc_tokens(&(values[i]->tokens), &(values[i]->num_tokens), proc_name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
@ -877,7 +918,7 @@ int orte_ras_base_node_delete(opal_list_t* nodes)
|
||||
node = (orte_ras_node_t*)item;
|
||||
|
||||
/* setup index/keys for this node */
|
||||
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, node->node_name);
|
||||
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, node->node_cellid, node->node_name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
@ -958,7 +999,7 @@ int orte_ras_base_node_assign(opal_list_t* nodes, orte_jobid_t jobid)
|
||||
continue;
|
||||
|
||||
/* setup index/keys for this node */
|
||||
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_name);
|
||||
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
free(jobid_str);
|
||||
|
@ -71,7 +71,7 @@ int orte_ras_base_node_query_alloc_no_op(opal_list_t*, orte_jobid_t);
|
||||
|
||||
int orte_ras_base_proc_query_alloc_no_op(opal_list_t* procs);
|
||||
|
||||
orte_ras_node_t* orte_ras_base_node_lookup_no_op(const char* nodename);
|
||||
orte_ras_node_t* orte_ras_base_node_lookup_no_op(orte_cellid_t, const char* nodename);
|
||||
|
||||
/*
|
||||
* Internal support functions
|
||||
@ -92,7 +92,7 @@ ORTE_DECLSPEC int orte_ras_base_node_query(opal_list_t*);
|
||||
/*
|
||||
* Query the registry for a specific node
|
||||
*/
|
||||
ORTE_DECLSPEC orte_ras_node_t* orte_ras_base_node_lookup(const char* nodename);
|
||||
ORTE_DECLSPEC orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t, const char* nodename);
|
||||
|
||||
/**
|
||||
* Query the registry for all nodes allocated to a specific job
|
||||
@ -106,7 +106,7 @@ ORTE_DECLSPEC int orte_ras_base_proc_query_alloc(opal_list_t* procs);
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_ras_base_node_insert(opal_list_t*);
|
||||
|
||||
ORTE_DECLSPEC int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid);
|
||||
ORTE_DECLSPEC int orte_ras_base_proc_insert(opal_list_t* procs, orte_cellid_t cellid, orte_jobid_t jobid);
|
||||
|
||||
/**
|
||||
* Delete the specified nodes from the registry
|
||||
|
@ -149,6 +149,9 @@ static int orte_ras_dash_host_allocate(orte_jobid_t jobid, opal_list_t *attribut
|
||||
node->node_name = strdup(mapped_nodes[i]);
|
||||
node->node_arch = NULL;
|
||||
node->node_state = ORTE_NODE_STATE_UP;
|
||||
/* JMS: this should not be hard-wired to 0, but there's no
|
||||
other value to put it to [yet]... */
|
||||
node->node_cellid = 0;
|
||||
node->node_slots_inuse = 0;
|
||||
node->node_slots_max = 0;
|
||||
node->node_slots = 1;
|
||||
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче
Block a user