1
1

temporarily back out r15517 and 15520 so that I can get the RML / OOB changes

to cleanly apply

This commit was SVN r15527.

The following SVN revision numbers were found above:
  r15517 --> open-mpi/ompi@41977fcc95
Этот коммит содержится в:
Brian Barrett 2007-07-20 01:10:34 +00:00
родитель 824ef791f9
Коммит 2d17dd9516
190 изменённых файлов: 6389 добавлений и 935 удалений

Просмотреть файл

@ -1198,6 +1198,7 @@ AC_CONFIG_FILES([
orte/include/Makefile
orte/etc/Makefile
orte/tools/console/Makefile
orte/tools/orteboot/Makefile
orte/tools/orted/Makefile
orte/tools/ortehalt/Makefile

Просмотреть файл

@ -1068,7 +1068,7 @@ int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high )
ourproc = intercomm->c_local_group->grp_proc_pointers[0];
theirproc = intercomm->c_remote_group->grp_proc_pointers[0];
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
rc = orte_ns.compare_fields(mask, &(ourproc->proc_name), &(theirproc->proc_name));
if ( 0 > rc ) {
flag = true;

Просмотреть файл

@ -26,7 +26,6 @@
#include "btl_base_error.h"
#include "opal/util/show_help.h"
#include "orte/util/sys_info.h"
#include "orte/mca/ns/ns_types.h"
int mca_btl_base_debug;
@ -60,7 +59,8 @@ void mca_btl_base_error_no_nics(const char* transport,
char *procid;
if (mca_btl_base_warn_component_unused) {
/* print out no-nic warning if user told us to */
asprintf(&procid, "%s", ORTE_NAME_PRINT(orte_process_info.my_name));
asprintf(&procid, "[%lu,%lu,%lu]",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_show_help("help-mpi-btl-base.txt", "btl:no-nics",
true, procid, transport, orte_system_info.nodename,

Просмотреть файл

@ -27,7 +27,6 @@
#include "orte/util/proc_info.h"
#include "orte/util/sys_info.h"
#include "orte/mca/ns/ns_types.h"
OMPI_DECLSPEC extern int mca_btl_base_debug;
@ -36,9 +35,9 @@ extern int mca_btl_base_out(const char*, ...);
#define BTL_OUTPUT(args) \
do { \
mca_btl_base_out("[%s]%s[%s:%d:%s] ", \
mca_btl_base_out("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
orte_system_info.nodename, \
ORTE_NAME_PRINT(orte_process_info.my_name), \
ORTE_NAME_ARGS(orte_process_info.my_name), \
__FILE__, __LINE__, __func__); \
mca_btl_base_out args; \
mca_btl_base_out("\n"); \
@ -47,9 +46,9 @@ do { \
#define BTL_ERROR(args) \
do { \
mca_btl_base_err("[%s]%s[%s:%d:%s] ", \
mca_btl_base_err("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
orte_system_info.nodename, \
ORTE_NAME_PRINT(orte_process_info.my_name), \
ORTE_NAME_ARGS(orte_process_info.my_name), \
__FILE__, __LINE__, __func__); \
mca_btl_base_err args; \
mca_btl_base_err("\n"); \
@ -57,8 +56,8 @@ do { \
#define BTL_PEER_ERROR(proc, args) \
do { \
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
ORTE_NAME_PRINT(orte_process_info.my_name), \
mca_btl_base_err("[%ld,%ld,%ld][%s:%d:%s] from %s ", \
ORTE_NAME_ARGS(orte_process_info.my_name), \
__FILE__, __LINE__, __func__, \
orte_system_info.nodename); \
if(proc && proc->proc_hostname) { \
@ -73,9 +72,9 @@ do { \
#define BTL_DEBUG(args) \
do { \
if(mca_btl_base_debug) { \
mca_btl_base_err("[%s]%s[%s:%d:%s] ", \
mca_btl_base_err("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
orte_system_info.nodename, \
ORTE_NAME_PRINT(orte_process_info.my_name), \
ORTE_NAME_ARGS(orte_process_info.my_name), \
__FILE__, __LINE__, __func__); \
mca_btl_base_err args; \
mca_btl_base_err("\n"); \
@ -84,9 +83,9 @@ do { \
#define BTL_VERBOSE(args) \
do { \
if(mca_btl_base_debug > 1) { \
mca_btl_base_err("[%s]%s[%s:%d:%s] ", \
mca_btl_base_err("[%s][%ld,%ld,%ld][%s:%d:%s] ", \
orte_system_info.nodename, \
ORTE_NAME_PRINT(orte_process_info.my_name), \
ORTE_NAME_ARGS(orte_process_info.my_name), \
__FILE__, __LINE__, __func__); \
mca_btl_base_err args; \
mca_btl_base_err("\n"); \

Просмотреть файл

@ -429,10 +429,10 @@ static int mca_btl_gm_discover( void )
if(mca_btl_gm_component.gm_debug > 0) {
opal_output(0,
"%s gm_port %08lX, "
"[%ld,%ld,%ld] gm_port %08lX, "
"board %" PRIu32 ", global %" PRIu32 " "
"node %" PRIu32 "port %" PRIu32 "\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_ARGS(orte_process_info.my_name),
(unsigned long) port, board_no, global_id, node_id, port_no);
}

Просмотреть файл

@ -128,15 +128,15 @@ mca_btl_gm_proc_t* mca_btl_gm_proc_create(ompi_proc_t* ompi_proc)
(void*)&gm_proc->proc_addrs,
&size);
if(OMPI_SUCCESS != rc) {
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(gm_proc);
return NULL;
}
if((size % sizeof(mca_btl_gm_addr_t)) != 0) {
opal_output(0, "[%s:%d] invalid gm address for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] invalid gm address for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(gm_proc);
return NULL;
}
@ -189,9 +189,9 @@ int mca_btl_gm_proc_insert(
return OMPI_ERROR;
}
if(mca_btl_gm_component.gm_debug > 0) {
opal_output(0, "%s mapped global id %" PRIu32
opal_output(0, "[%ld,%ld,%ld] mapped global id %" PRIu32
" to node id %" PRIu32 "\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_ARGS(orte_process_info.my_name),
gm_endpoint->endpoint_addr.global_id,
gm_endpoint->endpoint_addr.node_id);
}

Просмотреть файл

@ -798,8 +798,8 @@ void mca_btl_mvapi_dump(
opal_output( 0, "No endpoint for this peer\n" );
return;
}
opal_output( 0, "endpoint with processor %s\n",
ORTE_NAME_PRINT( &(endpoint->endpoint_proc->proc_ompi->proc_name) ) );
opal_output( 0, "endpoint with processor (%lu.%lu.%lu)\n",
ORTE_NAME_ARGS( &(endpoint->endpoint_proc->proc_ompi->proc_name) ) );
opal_output( 0, "endpoint state: %s\n",
(endpoint->endpoint_state == MCA_BTL_IB_CONNECTING ? "connecting" :
(endpoint->endpoint_state == MCA_BTL_IB_CONNECT_ACK ? "waiting ack" :

Просмотреть файл

@ -140,15 +140,15 @@ mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc)
if(OMPI_SUCCESS != rc) {
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(mvapi_proc);
return NULL;
}
if((size % sizeof(mca_btl_mvapi_port_info_t)) != 0) {
opal_output(0, "[%s:%d] invalid mvapi address for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] invalid mvapi address for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(mvapi_proc);
return NULL;
}

Просмотреть файл

@ -121,8 +121,8 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
rc = ompi_modex_recv( &mca_btl_mx_component.super.btl_version,
ompi_proc, (void*)&mx_peers, &size );
if( OMPI_SUCCESS != rc ) {
opal_output( 0, "mca_pml_base_modex_recv failed for peer %s",
ORTE_NAME_PRINT(&ompi_proc->proc_name) );
opal_output( 0, "mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
ORTE_NAME_ARGS(&ompi_proc->proc_name) );
return NULL;
}
@ -130,8 +130,8 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
return NULL;
}
if( (size % sizeof(mca_btl_mx_addr_t)) != 0 ) {
opal_output( 0, "invalid mx address for peer %s",
ORTE_NAME_PRINT(&ompi_proc->proc_name) );
opal_output( 0, "invalid mx address for peer [%ld,%ld,%ld]",
ORTE_NAME_ARGS(&ompi_proc->proc_name) );
return NULL;
}

Просмотреть файл

@ -127,15 +127,15 @@ mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
if(OMPI_SUCCESS != rc) {
opal_output(0,
"[%s:%d] ompi_modex_recv failed for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
"[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}
if((size % sizeof(mca_btl_ud_addr_t)) != 0) {
opal_output(0, "[%s:%d] invalid module address for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}

Просмотреть файл

@ -137,15 +137,15 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
if(OMPI_SUCCESS != rc) {
opal_output(mca_btl_base_output, "[%s:%d] ompi_modex_recv failed for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(mca_btl_base_output, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}
if((size % sizeof(mca_btl_openib_port_info_t)) != 0) {
opal_output(mca_btl_base_output, "[%s:%d] invalid module address for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(mca_btl_base_output, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}

Просмотреть файл

@ -470,8 +470,8 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
ORTE_PROCESS_NAME_NTOH(guid);
/* compare this to the expected values */
if (0 != orte_ns.compare_fields(ORTE_NS_CMP_ALL, &btl_proc->proc_name, &guid)) {
BTL_ERROR(("received unexpected process identifier %s",
ORTE_NAME_PRINT(&guid)));
BTL_ERROR(("received unexpected process identifier [%lu,%lu,%lu]",
ORTE_NAME_ARGS(&guid)));
mca_btl_tcp_endpoint_close(btl_endpoint);
return OMPI_ERR_UNREACH;
}

Просмотреть файл

@ -129,15 +129,15 @@ mca_btl_udapl_proc_t* mca_btl_udapl_proc_create(ompi_proc_t* ompi_proc)
(void*)&udapl_proc->proc_addrs,
&size);
if(OMPI_SUCCESS != rc) {
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(udapl_proc);
return NULL;
}
if((size % sizeof(mca_btl_udapl_addr_t)) != 0) {
opal_output(0, "[%s:%d] invalid udapl address for peer %s",
__FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
opal_output(0, "[%s:%d] invalid udapl address for peer [%lu,%lu,%lu]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(udapl_proc);
return NULL;
}

Просмотреть файл

@ -435,6 +435,7 @@ void ompi_crcp_coord_pml_message_ref_construct(ompi_crcp_coord_pml_message_ref_t
msg_ref->comm = NULL;
msg_ref->request = NULL;
msg_ref->proc_name.cellid = ORTE_CELLID_INVALID;
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
@ -470,6 +471,7 @@ void ompi_crcp_coord_pml_message_ref_destruct( ompi_crcp_coord_pml_message_ref_t
msg_ref->request = NULL;
}
msg_ref->proc_name.cellid = ORTE_CELLID_INVALID;
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
@ -487,6 +489,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_peer_ref_t,
ompi_crcp_coord_pml_peer_ref_destruct);
void ompi_crcp_coord_pml_peer_ref_construct(ompi_crcp_coord_pml_peer_ref_t *peer_ref) {
peer_ref->proc_name.cellid = ORTE_CELLID_INVALID;
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
@ -518,6 +521,7 @@ void ompi_crcp_coord_pml_peer_ref_construct(ompi_crcp_coord_pml_peer_ref_t *peer
void ompi_crcp_coord_pml_peer_ref_destruct( ompi_crcp_coord_pml_peer_ref_t *peer_ref) {
opal_list_item_t* item = NULL;
peer_ref->proc_name.cellid = ORTE_CELLID_INVALID;
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
@ -589,6 +593,7 @@ OBJ_CLASS_INSTANCE(drain_msg_ack_ref_t,
void drain_msg_ack_ref_construct(drain_msg_ack_ref_t *msg_ack_ref) {
msg_ack_ref->complete = false;
msg_ack_ref->peer.cellid = ORTE_CELLID_INVALID;
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
}
@ -596,6 +601,7 @@ void drain_msg_ack_ref_construct(drain_msg_ack_ref_t *msg_ack_ref) {
void drain_msg_ack_ref_destruct( drain_msg_ack_ref_t *msg_ack_ref) {
msg_ack_ref->complete = false;
msg_ack_ref->peer.cellid = ORTE_CELLID_INVALID;
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
}
@ -644,7 +650,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_state_t,
v_msg_ref = v_coord_state->msg_ref; \
}
#define CREATE_NEW_MSG(msg_ref, v_type, v_buffer, v_count, v_datatype, v_tag, v_rank, v_comm, v_request, p_jobid, p_vpid) \
#define CREATE_NEW_MSG(msg_ref, v_type, v_buffer, v_count, v_datatype, v_tag, v_rank, v_comm, v_request, p_cellid, p_jobid, p_vpid) \
{ \
msg_ref = OBJ_NEW(ompi_crcp_coord_pml_message_ref_t); \
msg_ref->msg_id = message_seq_num; \
@ -671,6 +677,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_state_t,
OBJ_RETAIN(msg_ref->request); \
} \
\
msg_ref->proc_name.cellid = p_cellid; \
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
}
@ -701,6 +708,7 @@ OBJ_CLASS_INSTANCE(ompi_crcp_coord_pml_state_t,
OBJ_RETAIN(msg_ref->request); \
} \
\
dup_msg_ref->proc_name.cellid = msg_ref->proc_name.cellid; \
dup_msg_ref->proc_name.jobid = msg_ref->proc_name.jobid; \
dup_msg_ref->proc_name.vpid = msg_ref->proc_name.vpid; \
}
@ -876,6 +884,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_add_procs(
for( i = 0; i < nprocs; ++i) {
new_peer_ref = OBJ_NEW(ompi_crcp_coord_pml_peer_ref_t);
new_peer_ref->proc_name.cellid = procs[i]->proc_name.cellid;
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
@ -908,8 +917,8 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_del_procs(
item = (opal_list_item_t*)find_peer(procs[i]->proc_name);
if(NULL == item) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: del_procs: Unable to find peer %s\n",
ORTE_NAME_PRINT(&(procs[i]->proc_name)));
"crcp:coord: del_procs: Unable to find peer [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(&(procs[i]->proc_name)));
exit_status = OMPI_ERROR;
goto DONE;
}
@ -972,6 +981,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_isend_init(
buf,
count, datatype, tag, dst, comm,
NULL,
peer_ref->proc_name.cellid,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
@ -1065,6 +1075,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_isend(
CREATE_NEW_MSG(msg_ref, COORD_MSG_TYPE_I_SEND,
buf,
count, datatype, tag, dst, comm, NULL,
peer_ref->proc_name.cellid,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
@ -1160,6 +1171,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_send(
CREATE_NEW_MSG(msg_ref, COORD_MSG_TYPE_B_SEND,
buf,
count, datatype, tag, dst, comm, NULL,
peer_ref->proc_name.cellid,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
@ -1252,6 +1264,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv_init(
buf,
count, datatype, tag, src, comm,
NULL, /* Leave this NULL for now, will pick up real value in POST */
ORTE_CELLID_INVALID,
ORTE_JOBID_INVALID,
ORTE_VPID_INVALID);
@ -1283,6 +1296,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv_init(
goto DONE;
}
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
@ -1438,6 +1452,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv(
buf,
count, datatype, tag, src, comm,
NULL, /* Leave this NULL for now, will pick up real value in POST */
ORTE_CELLID_INVALID,
ORTE_JOBID_INVALID,
ORTE_VPID_INVALID);
@ -1469,6 +1484,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_irecv(
goto DONE;
}
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
@ -1640,6 +1656,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
buf,
count, datatype, tag, src,
comm, request,
ORTE_CELLID_INVALID,
ORTE_JOBID_INVALID,
ORTE_VPID_INVALID);
@ -1671,6 +1688,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
goto DONE;
}
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
@ -1704,6 +1722,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
goto DONE;
}
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
@ -1840,6 +1859,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
CREATE_NEW_MSG(msg_ref, COORD_MSG_TYPE_B_RECV,
buf,
count, datatype, tag, src, comm, NULL,
ORTE_CELLID_INVALID,
ORTE_JOBID_INVALID,
ORTE_VPID_INVALID);
@ -1875,6 +1895,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
goto DONE;
}
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
@ -1921,6 +1942,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_coord_pml_recv(
goto DONE;
}
msg_ref->proc_name.cellid = peer_ref->proc_name.cellid;
msg_ref->proc_name.jobid = peer_ref->proc_name.jobid;
msg_ref->proc_name.vpid = peer_ref->proc_name.vpid;
@ -2998,8 +3020,8 @@ static int ft_event_coordinate_peers(void)
*/
if( stall_for_completion ) {
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: ft_event_coordinate_peers: %s **** STALLING ***",
ORTE_NAME_PRINT(orte_process_info.my_name));
"crcp:coord: ft_event_coordinate_peers: [%lu,%lu,%lu] **** STALLING ***",
ORTE_NAME_ARGS(orte_process_info.my_name));
step_to_return_to = 1;
exit_status = OMPI_SUCCESS;
goto DONE;
@ -3036,8 +3058,8 @@ static int ft_event_coordinate_peers(void)
}
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
"crcp:coord: ft_event_coordinate_peers: %s Coordination Finished...\n",
ORTE_NAME_PRINT(orte_process_info.my_name) );
"crcp:coord: ft_event_coordinate_peers: [%lu,%lu,%lu] Coordination Finished...\n",
ORTE_NAME_ARGS(orte_process_info.my_name) );
/*
* Now that all our peer channels are marked as drained
@ -3154,11 +3176,11 @@ static int ft_event_check_bookmarks(void)
if( 10 <= mca_crcp_coord_component.super.verbose ) {
sleep(orte_process_info.my_name->vpid);
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"Process %s Match Table",
ORTE_NAME_PRINT(orte_process_info.my_name));
"Process [%lu,%lu,%lu] Match Table",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"%s %5s | %7s | %7s | %7s | %7s |",
ORTE_NAME_PRINT(orte_process_info.my_name),
"[%lu,%lu,%lu] %5s | %7s | %7s | %7s | %7s |",
ORTE_NAME_ARGS(orte_process_info.my_name),
"Vpid", "T_Send", "M_Recv", "M_Send", "T_Recv");
for(item = opal_list_get_first(&ompi_crcp_coord_pml_peer_refs);
@ -3183,8 +3205,8 @@ static int ft_event_check_bookmarks(void)
peer_ref->matched_recv_init_msgs );
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"%s %5d | %7d | %7d | %7d | %7d |",
ORTE_NAME_PRINT(orte_process_info.my_name),
"[%lu,%lu,%lu] %5d | %7d | %7d | %7d | %7d |",
ORTE_NAME_ARGS(orte_process_info.my_name),
peer_ref->proc_name.vpid,
t_send, m_recv, m_send, t_recv);
}
@ -3223,11 +3245,11 @@ static int ft_event_check_bookmarks(void)
/* T_Send >= M_Recv */
if( p_n_to_p_m < p_n_from_p_m ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s --> %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
" WARNING: Peer received more than was sent. :(\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3238,10 +3260,10 @@ static int ft_event_check_bookmarks(void)
* so need to coordinate with peer. */
if( p_n_to_p_m > p_n_from_p_m) {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s --> %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
"Sent Msgs (%4d) = Received Msgs (%4d). Peer needs %4d.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3253,8 +3275,8 @@ static int ft_event_check_bookmarks(void)
*/
if( OMPI_SUCCESS != (ret = send_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: Unable to send message details to peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_ref->proc_name),
"crcp:coord: check_bookmarks: Unable to send message details to peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_ref->proc_name),
ret);
return ret;
}
@ -3274,11 +3296,11 @@ static int ft_event_check_bookmarks(void)
/* M_Send >= T_Recv */
if( p_n_to_p_m < p_n_from_p_m ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s --> %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
" WARNING: I received more than the peer sent. :(\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3289,10 +3311,10 @@ static int ft_event_check_bookmarks(void)
* so need to coordinate with peer. */
if( p_n_to_p_m > p_n_from_p_m) {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s <-- %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
"Received Msgs (%4d) = Sent Msgs (%4d). I need %4d.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3303,8 +3325,8 @@ static int ft_event_check_bookmarks(void)
*/
if( OMPI_SUCCESS != (ret = recv_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: Unable to recv message details from peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_ref->proc_name),
"crcp:coord: check_bookmarks: Unable to recv message details from peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_ref->proc_name),
ret);
return ret;
}
@ -3326,11 +3348,11 @@ static int ft_event_check_bookmarks(void)
/* M_Send >= T_Recv */
if( p_n_to_p_m < p_n_from_p_m ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s --> %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
" WARNING: I received more than the peer sent. :(\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3341,10 +3363,10 @@ static int ft_event_check_bookmarks(void)
* so need to coordinate with peer. */
if( p_n_to_p_m > p_n_from_p_m) {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s <-- %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
"Received Msgs (%4d) = Sent Msgs (%4d). I need %4d.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3355,8 +3377,8 @@ static int ft_event_check_bookmarks(void)
*/
if( OMPI_SUCCESS != (ret = recv_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: Unable to recv message details from peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_ref->proc_name),
"crcp:coord: check_bookmarks: Unable to recv message details from peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_ref->proc_name),
ret);
return ret;
}
@ -3376,11 +3398,11 @@ static int ft_event_check_bookmarks(void)
/* T_Send >= M_Recv */
if( p_n_to_p_m < p_n_from_p_m ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s --> %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
"Sent Msgs (%4d) = Received Msgs (%4d) => Diff (%4d). "
" WARNING: Peer received more than was sent. :(\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3391,10 +3413,10 @@ static int ft_event_check_bookmarks(void)
* so need to coordinate with peer. */
if( p_n_to_p_m > p_n_from_p_m) {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: %s --> %s "
"crcp:coord: check_bookmarks: [%lu,%lu,%lu] --> [%lu,%lu,%lu] "
"Sent Msgs (%4d) = Received Msgs (%4d). Peer needs %4d.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
p_n_to_p_m,
p_n_from_p_m,
(p_n_to_p_m - p_n_from_p_m)
@ -3406,8 +3428,8 @@ static int ft_event_check_bookmarks(void)
*/
if( OMPI_SUCCESS != (ret = send_msg_details(peer_ref, p_n_to_p_m, p_n_from_p_m) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: check_bookmarks: Unable to send message details to peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_ref->proc_name),
"crcp:coord: check_bookmarks: Unable to send message details to peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_ref->proc_name),
ret);
return ret;
}
@ -3440,8 +3462,8 @@ static int ft_event_post_drain_acks(void)
}
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: post_drain_ack: %s Wait on %d Drain ACK Messages.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
"crcp:coord: post_drain_ack: [%lu,%lu,%lu] Wait on %d Drain ACK Messages.\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
(int)req_size);
/*
@ -3460,8 +3482,8 @@ static int ft_event_post_drain_acks(void)
drain_message_ack_cbfunc,
NULL) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: post_drain_acks: %s Failed to post a RML receive to the peer\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
"crcp:coord: post_drain_acks: [%lu,%lu,%lu] Failed to post a RML receive to the peer\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
return ret;
}
}
@ -3496,23 +3518,24 @@ static void drain_message_ack_cbfunc(int status,
/* If this ACK has not completed yet */
if(!drain_msg_ack->complete) {
/* If it is the correct peer */
if(drain_msg_ack->peer.jobid == sender->jobid &&
if(drain_msg_ack->peer.cellid == sender->cellid &&
drain_msg_ack->peer.jobid == sender->jobid &&
drain_msg_ack->peer.vpid == sender->vpid ) {
/* We found it! */
drain_msg_ack->complete = true;
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
"crcp:coord: drain_message_ack_cbfunc: %s --> %s Received ACK of FLUSH from peer\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(sender) );
"crcp:coord: drain_message_ack_cbfunc: [%lu,%lu,%lu] --> [%lu,%lu,%lu] Received ACK of FLUSH from peer\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(sender) );
return;
}
}
}
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: drain_message_ack_cbfunc: %s --> %s ERROR: Uable to match ACK to peer\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(sender) );
"crcp:coord: drain_message_ack_cbfunc: [%lu,%lu,%lu] --> [%lu,%lu,%lu] ERROR: Uable to match ACK to peer\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(sender) );
cleanup:
return;
@ -3530,8 +3553,8 @@ static int ft_event_post_drained(void)
}
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: post_drained: %s Draining %d Messages.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
"crcp:coord: post_drained: [%lu,%lu,%lu] Draining %d Messages.\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
(int)req_size);
/*
@ -3551,8 +3574,8 @@ static int ft_event_post_drained(void)
*/
if( drain_msg->already_posted ) {
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: post_drained: %s Found a message that we don't need to post.\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
"crcp:coord: post_drained: [%lu,%lu,%lu] Found a message that we don't need to post.\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
continue;
}
/*
@ -3560,8 +3583,8 @@ static int ft_event_post_drained(void)
*/
else {
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: post_drained: %s Posting a message to be drained from %d.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
"crcp:coord: post_drained: [%lu,%lu,%lu] Posting a message to be drained from %d.\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
drain_msg->rank);
if( OMPI_SUCCESS != (ret = wrapped_pml_module->pml_irecv(drain_msg->buffer,
(drain_msg->count * drain_msg->ddt_size),
@ -3571,8 +3594,8 @@ static int ft_event_post_drained(void)
drain_msg->comm,
&(drain_msg->request) ) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: post_drained: %s Failed to post the Draining PML iRecv\n",
ORTE_NAME_PRINT(orte_process_info.my_name) );
"crcp:coord: post_drained: [%lu,%lu,%lu] Failed to post the Draining PML iRecv\n",
ORTE_NAME_ARGS(orte_process_info.my_name) );
return ret;
}
}
@ -3591,8 +3614,8 @@ static int ft_event_wait_quiesce(void)
**********************************************/
if( OMPI_SUCCESS != (ret = wait_quiesce_drained() ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce: %s Failed to quiesce drained messages\n",
ORTE_NAME_PRINT(orte_process_info.my_name) );
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] Failed to quiesce drained messages\n",
ORTE_NAME_ARGS(orte_process_info.my_name) );
exit_status = ret;
goto cleanup;
}
@ -3602,8 +3625,8 @@ static int ft_event_wait_quiesce(void)
*******************************************************************/
if( OMPI_SUCCESS != (ret = wait_quiesce_drain_ack() ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce: %s Failed to recv all drain ACKs\n",
ORTE_NAME_PRINT(orte_process_info.my_name) );
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] Failed to recv all drain ACKs\n",
ORTE_NAME_ARGS(orte_process_info.my_name) );
exit_status = ret;
goto cleanup;
}
@ -3635,8 +3658,8 @@ static int wait_quiesce_drained(void)
}
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce_drained: %s Waiting on %d messages to drain\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
"crcp:coord: wait_quiesce_drained: [%lu,%lu,%lu] Waiting on %d messages to drain\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
(int)req_size);
/*
@ -3667,6 +3690,7 @@ static int wait_quiesce_drained(void)
wait_any_requests[i] = &ompi_request_null;
wait_any_status[i] = &ompi_status_empty;
proc_names[i].cellid = ORTE_CELLID_INVALID;
proc_names[i].jobid = ORTE_JOBID_INVALID;
proc_names[i].vpid = ORTE_VPID_INVALID;
}
@ -3690,15 +3714,15 @@ static int wait_quiesce_drained(void)
*/
if( drain_msg->already_posted && NULL == drain_msg->request) {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce_drained: %s - %s Already posted this msg.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(drain_msg->proc_name)) );
"crcp:coord: wait_quiesce_drained: [%lu,%lu,%lu] - [%lu,%lu,%lu] Already posted this msg.\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(drain_msg->proc_name)) );
}
else {
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce_drained: %s - %s Waiting on message. (index = %d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(drain_msg->proc_name)),
"crcp:coord: wait_quiesce_drained: [%lu,%lu,%lu] - [%lu,%lu,%lu] Waiting on message. (index = %d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(drain_msg->proc_name)),
(int)wait_any_count);
wait_any_requests[wait_any_count] = drain_msg->request;
@ -3712,7 +3736,8 @@ static int wait_quiesce_drained(void)
/* Add proc to response queue if it is not already there */
found = false;
for(i = 0; i < last_proc_idx; ++i) {
if(proc_names[i].jobid == drain_msg->proc_name.jobid &&
if(proc_names[i].cellid == drain_msg->proc_name.cellid &&
proc_names[i].jobid == drain_msg->proc_name.jobid &&
proc_names[i].vpid == drain_msg->proc_name.vpid ) {
found = true;
break;
@ -3720,11 +3745,12 @@ static int wait_quiesce_drained(void)
}
if( !found ) {
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce: %s - %s Add process to response list [idx %d]\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(drain_msg->proc_name)),
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] - [%lu,%lu,%lu] Add process to response list [idx %d]\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(drain_msg->proc_name)),
(int)last_proc_idx);
proc_names[last_proc_idx].cellid = drain_msg->proc_name.cellid;
proc_names[last_proc_idx].jobid = drain_msg->proc_name.jobid;
proc_names[last_proc_idx].vpid = drain_msg->proc_name.vpid;
last_proc_idx++;
@ -3748,8 +3774,8 @@ static int wait_quiesce_drained(void)
* Send ACKs to all peers
*/
opal_output_verbose(5, mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce: %s Send ACKs to all Peers\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
"crcp:coord: wait_quiesce: [%lu,%lu,%lu] Send ACKs to all Peers\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
for(i = 0; i < last_proc_idx; ++i) {
orte_buffer_t *buffer = NULL;
@ -3828,8 +3854,8 @@ static int coord_request_wait_all( size_t count,
coord_request_wait(req, status);
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: request_wait_all: %s Done with idx %d of %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
"crcp:coord: request_wait_all: [%lu,%lu,%lu] Done with idx %d of %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
(int)i, (int)count);
}
@ -3871,8 +3897,8 @@ static int wait_quiesce_drain_ack(void)
}
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: wait_quiesce_drain_ack: %s Waiting on %d Drain ACK messages\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
"crcp:coord: wait_quiesce_drain_ack: [%lu,%lu,%lu] Waiting on %d Drain ACK messages\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
num_outstanding);
while(0 < num_outstanding) {
@ -3913,6 +3939,7 @@ static int send_bookmarks(int peer_idx)
/*
* Find the peer structure for this peer
*/
peer_name.cellid = orte_process_info.my_name->cellid;
peer_name.jobid = orte_process_info.my_name->jobid;
peer_name.vpid = peer_idx;
@ -3925,9 +3952,9 @@ static int send_bookmarks(int peer_idx)
}
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: send_bookmarks: %s -> %s Sending bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&peer_name),
"crcp:coord: send_bookmarks: [%lu,%lu,%lu] -> [%lu,%lu,%lu] Sending bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&peer_name),
peer_ref->total_send_msgs,
peer_ref->total_isend_msgs,
peer_ref->total_send_init_msgs,
@ -3959,8 +3986,8 @@ static int send_bookmarks(int peer_idx)
if ( 0 > ( ret = orte_rml.send_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: send_bookmarks: Failed to send bookmark to peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_name),
"crcp:coord: send_bookmarks: Failed to send bookmark to peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_name),
ret);
exit_status = ret;
goto cleanup;
@ -3986,6 +4013,7 @@ static int recv_bookmarks(int peer_idx)
/*
* Find the peer structure for this peer
*/
peer_name.cellid = orte_process_info.my_name->cellid;
peer_name.jobid = orte_process_info.my_name->jobid;
peer_name.vpid = peer_idx;
@ -4007,8 +4035,8 @@ static int recv_bookmarks(int peer_idx)
if ( 0 > (ret = orte_rml.recv_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_name),
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_name),
ret);
exit_status = ret;
goto cleanup;
@ -4035,9 +4063,9 @@ static int recv_bookmarks(int peer_idx)
peer_ref->matched_recv_init_msgs = tmp_int;
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_bookmarks: %s <- %s Received bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&peer_name),
"crcp:coord: recv_bookmarks: [%lu,%lu,%lu] <- [%lu,%lu,%lu] Received bookmark S[%4d,%4d,%4d] R[%4d,%4d,%4d]\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&peer_name),
peer_ref->matched_send_msgs,
peer_ref->matched_isend_msgs,
peer_ref->matched_send_init_msgs,
@ -4091,9 +4119,9 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
found_match = false;
if(OMPI_SUCCESS != (ret = do_send_msg_detail(peer_ref, msg_ref, &found_match, &finished)) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: send_msg_details: %s --> %s Failed to send message details to peer. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
"crcp:coord: send_msg_details: [%lu,%lu,%lu] --> [%lu,%lu,%lu] Failed to send message details to peer. Return %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
}
if(found_match) {
@ -4137,14 +4165,15 @@ static int send_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
* inflight messages into a local buffer
*/
d_msg_ack = OBJ_NEW(drain_msg_ack_ref_t);
d_msg_ack->peer.cellid = peer_ref->proc_name.cellid;
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
d_msg_ack->complete = false;
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: send_msg_details: %s <--> %s Will wait on ACK from this peer.\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)));
"crcp:coord: send_msg_details: [%lu,%lu,%lu] <--> [%lu,%lu,%lu] Will wait on ACK from this peer.\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)));
/*
* If we know that we are in the middle of a blocking send/recv then we
@ -4214,8 +4243,8 @@ static int do_send_msg_detail(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
if ( 0 > ( ret = orte_rml.send_buffer(&peer_ref->proc_name, buffer,
OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: do_send_msg_detail: Unable to send message details to peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_ref->proc_name),
"crcp:coord: do_send_msg_detail: Unable to send message details to peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_ref->proc_name),
ret);
exit_status = OMPI_ERROR;
@ -4241,9 +4270,9 @@ static int do_send_msg_detail(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
if ( 0 > (ret = orte_rml.recv_buffer(&peer_ref->proc_name, buffer,
OMPI_CRCP_COORD_BOOKMARK_TAG) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: do_send_msg_detail: %s --> %s Failed to receive ACK buffer from peer. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
"crcp:coord: do_send_msg_detail: [%lu,%lu,%lu] --> [%lu,%lu,%lu] Failed to receive ACK buffer from peer. Return %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
exit_status = ret;
goto cleanup;
@ -4312,10 +4341,10 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
&p_tag, &p_count,
&p_datatype_size)) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_details: %s <-- %s "
"crcp:coord: recv_msg_details: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
"Failed to receive message detail from peer. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
exit_status = ret;
goto cleanup;
@ -4332,10 +4361,10 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
p_datatype_size,
&found_match) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_details: %s <-- %s "
"crcp:coord: recv_msg_details: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] "
"Failed to check message detail from peer. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
exit_status = ret;
goto cleanup;
@ -4360,9 +4389,9 @@ static int recv_msg_details(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
if(OMPI_SUCCESS != (ret = do_recv_msg_detail_resp(peer_ref, response))) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_details: %s <-- %s Failed to respond to peer. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
"crcp:coord: recv_msg_details: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] Failed to respond to peer. Return %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
exit_status = ret;
goto cleanup;
@ -4392,9 +4421,9 @@ static int do_recv_msg_detail(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
*/
if ( 0 > (ret = orte_rml.recv_buffer(&peer_ref->proc_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: do_recv_msg_detail: %s <-- %s Failed to receive buffer from peer. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
"crcp:coord: do_recv_msg_detail: [%lu,%lu,%lu] <-- [%lu,%lu,%lu] Failed to receive buffer from peer. Return %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
exit_status = ret;
goto cleanup;
@ -4453,20 +4482,20 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
&msg_already_posted); /* Has the recv already been posted? */
if( OMPI_SUCCESS != ret) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: %s -- %s "
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] -- [%lu,%lu,%lu] "
"Failed to determine if we have received this message. Return %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
ret);
exit_status = ret;
goto cleanup;
}
opal_output_verbose(20, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: %s -- %s"
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] -- [%lu,%lu,%lu]"
" found %s, complete %s, posted %s, peer_rank=[%d vs %d]\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer_ref->proc_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer_ref->proc_name)),
(true == msg_found ? "True " : "False"),
(true == msg_complete ? "True " : "False"),
(true == msg_already_posted ? "True " : "False"),
@ -4482,8 +4511,8 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
ompi_crcp_coord_pml_message_ref_t *d_msg = NULL;
opal_output_verbose(15, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: %s Found a message that needs to be drained\n",
ORTE_NAME_PRINT(orte_process_info.my_name) );
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] Found a message that needs to be drained\n",
ORTE_NAME_ARGS(orte_process_info.my_name) );
/*
* Construct a message for draining
@ -4493,6 +4522,7 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
0, NULL, /* Setup the datatype outside of this */
tag, rank, ompi_comm_lookup(comm_id),
NULL,
peer_ref->proc_name.cellid,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
/*
@ -4540,9 +4570,9 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
ompi_crcp_coord_pml_message_ref_t *d_msg = NULL;
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: %s "
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] "
"Found a message already posted! Prepare to drain.\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
ORTE_NAME_ARGS(orte_process_info.my_name));
/*
* If this is the current blocking recv,
@ -4551,9 +4581,9 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
if( current_msg_id == posted_msg_ref->msg_id &&
COORD_MSG_TYPE_B_RECV == posted_msg_ref->msg_type) {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: %s "
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] "
"Found a message already posted! Prepare to STALL.\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
ORTE_NAME_ARGS(orte_process_info.my_name));
stall_for_completion = true;
}
/*
@ -4562,9 +4592,9 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
*/
else {
opal_output_verbose(10, mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: %s "
"crcp:coord: recv_msg_detail_check: [%lu,%lu,%lu] "
"Found a message already posted! No stall required [%3d, %3d, %3d, %3d].\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_ARGS(orte_process_info.my_name),
(int)current_msg_id,
(int)current_msg_type,
(int)posted_msg_ref->msg_id,
@ -4596,6 +4626,7 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
count, NULL,
tag, rank, ompi_comm_lookup(comm_id),
posted_msg_ref->request,
peer_ref->proc_name.cellid,
peer_ref->proc_name.jobid,
peer_ref->proc_name.vpid);
@ -4611,8 +4642,8 @@ static int do_recv_msg_detail_check(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
}
else {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_check: ***** ERROR ***** %s Failed to find an action to use. This should never happen!\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
"crcp:coord: recv_msg_detail_check: ***** ERROR ***** [%lu,%lu,%lu] Failed to find an action to use. This should never happen!\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
exit_status = OMPI_ERROR;
goto cleanup;
}
@ -4898,8 +4929,8 @@ static int do_recv_msg_detail_resp(ompi_crcp_coord_pml_peer_ref_t *peer_ref,
if ( 0 > ( ret = orte_rml.send_buffer(&peer_ref->proc_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_msg_detail_resp: Unable to send message detail response to peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_ref->proc_name),
"crcp:coord: recv_msg_detail_resp: Unable to send message detail response to peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_ref->proc_name),
ret);
exit_status = OMPI_ERROR;
goto cleanup;
@ -4957,6 +4988,7 @@ static int coord_basic_barrier_send(int peer_idx)
/*
* Find the peer structure for this peer
*/
peer_name.cellid = orte_process_info.my_name->cellid;
peer_name.jobid = orte_process_info.my_name->jobid;
peer_name.vpid = peer_idx;
@ -4974,8 +5006,8 @@ static int coord_basic_barrier_send(int peer_idx)
/* JJH -- Really Establish TAG in rml_types.h */
if ( 0 > ( ret = orte_rml.send_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG+1, 0)) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: coord_basic_barrier_send: Failed to send ACK to peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_name),
"crcp:coord: coord_basic_barrier_send: Failed to send ACK to peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_name),
ret);
exit_status = ret;
goto cleanup;
@ -5001,6 +5033,7 @@ static int coord_basic_barrier_recv(int peer_idx)
/*
* Find the peer structure for this peer
*/
peer_name.cellid = orte_process_info.my_name->cellid;
peer_name.jobid = orte_process_info.my_name->jobid;
peer_name.vpid = peer_idx;
@ -5014,8 +5047,8 @@ static int coord_basic_barrier_recv(int peer_idx)
if ( 0 > (ret = orte_rml.recv_buffer(&peer_name, buffer, OMPI_CRCP_COORD_BOOKMARK_TAG+1) ) ) {
opal_output(mca_crcp_coord_component.super.output_handle,
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer %s: Return %d\n",
ORTE_NAME_PRINT(&peer_name),
"crcp:coord: recv_bookmarks: Failed to receive bookmark from peer [%lu,%lu,%lu]: Return %d\n",
ORTE_NAME_ARGS(&peer_name),
ret);
exit_status = ret;
goto cleanup;

Просмотреть файл

@ -400,9 +400,9 @@ int mca_mpool_rdma_release_memory(struct mca_mpool_base_module_t *mpool,
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
opal_output(0, "%s rdma: stats "
opal_output(0, "[%lu,%lu,%lu] rdma: stats "
"(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_ARGS(orte_process_info.my_name),
mpool_rdma->stat_cache_hit, mpool_rdma->stat_cache_miss,
mpool_rdma->stat_cache_found, mpool_rdma->stat_cache_notfound,
mpool_rdma->stat_evicted);

Просмотреть файл

@ -318,14 +318,14 @@ mca_pml_base_pml_check_selected(const char *my_pml,
if ((size != strlen(my_pml) + 1) ||
(0 != strcmp(my_pml, remote_pml))) {
if (procs[i]->proc_hostname) {
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name),
opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] on %s selected pml %s",
ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
procs[i]->proc_hostname, remote_pml);
} else {
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name),
opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] selected pml %s",
ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
remote_pml);
}
return OMPI_ERR_UNREACH;

Просмотреть файл

@ -292,7 +292,7 @@ ompi_proc_t * ompi_proc_find ( const orte_process_name_t * name )
orte_ns_cmp_bitmask_t mask;
/* return the proc-struct which matches this jobid+process id */
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
OPAL_THREAD_LOCK(&ompi_proc_lock);
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
@ -315,7 +315,7 @@ ompi_proc_find_and_add(const orte_process_name_t * name, bool* isnew)
orte_ns_cmp_bitmask_t mask;
/* return the proc-struct which matches this jobid+process id */
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
OPAL_THREAD_LOCK(&ompi_proc_lock);
for(proc = (ompi_proc_t*)opal_list_get_first(&ompi_proc_list);
proc != (ompi_proc_t*)opal_list_get_end(&ompi_proc_list);
@ -550,7 +550,7 @@ static void callback(orte_gpr_notify_data_t *data, void *cbdata)
OPAL_THREAD_LOCK(&ompi_proc_lock);
/* loop over the data returned in the subscription */
mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
mask = ORTE_NS_CMP_CELLID | ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
value = (orte_gpr_value_t**)(data->values)->addr;
for (i = 0, k=0; k < data->cnt &&
i < (data->values)->size; ++i) {

Просмотреть файл

@ -806,8 +806,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
ompi_mpi_initialized = true;
if (orte_debug_flag) {
opal_output(0, "%s ompi_mpi_init completed",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] ompi_mpi_init completed",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* Do we need to wait for a TotalView-like debugger? */

Просмотреть файл

@ -46,7 +46,7 @@ static OBJ_CLASS_INSTANCE(
NULL);
#define GET_KEY(proc) \
( (((uint32_t) proc->jobid) << 24) + ((uint32_t) proc->vpid) )
( (((uint32_t) proc->cellid) << 24) + (((uint32_t) proc->jobid) << 16) + ((uint32_t) proc->vpid) )
void* orte_hash_table_get_proc(opal_hash_table_t* ht,
const orte_process_name_t* proc)

Просмотреть файл

@ -92,44 +92,48 @@ typedef void* orte_iov_base_ptr_t;
#define ORTE_NAME (orte_data_type_t) 22 /**< an orte_process_name_t */
#define ORTE_VPID (orte_data_type_t) 23 /**< a vpid */
#define ORTE_JOBID (orte_data_type_t) 24 /**< a jobid */
#define ORTE_NODEID (orte_data_type_t) 25 /**< a node id */
#define ORTE_PSET (orte_data_type_t) 25 /**< a process set */
#define ORTE_CELLID (orte_data_type_t) 26 /**< a cellid */
#define ORTE_NODEID (orte_data_type_t) 27 /**< a node id */
/* SMR types */
#define ORTE_NODE_STATE (orte_data_type_t) 26 /**< node status flag */
#define ORTE_PROC_STATE (orte_data_type_t) 27 /**< process/resource status */
#define ORTE_JOB_STATE (orte_data_type_t) 28 /**< job status flag */
#define ORTE_EXIT_CODE (orte_data_type_t) 29 /**< process exit code */
#define ORTE_NODE_STATE (orte_data_type_t) 28 /**< node status flag */
#define ORTE_PROC_STATE (orte_data_type_t) 29 /**< process/resource status */
#define ORTE_PSET_STATE (orte_data_type_t) 30 /**< process set state */
#define ORTE_JOB_STATE (orte_data_type_t) 31 /**< job status flag */
#define ORTE_EXIT_CODE (orte_data_type_t) 32 /**< process exit code */
/* GPR types */
#define ORTE_GPR_KEYVAL (orte_data_type_t) 30 /**< registry key-value pair */
#define ORTE_GPR_NOTIFY_ACTION (orte_data_type_t) 31 /**< registry notify action */
#define ORTE_GPR_TRIGGER_ACTION (orte_data_type_t) 32 /**< registry trigger action */
#define ORTE_GPR_CMD (orte_data_type_t) 33 /**< registry command */
#define ORTE_GPR_SUBSCRIPTION_ID (orte_data_type_t) 34 /**< registry notify id tag */
#define ORTE_GPR_TRIGGER_ID (orte_data_type_t) 35 /**< registry notify id tag */
#define ORTE_GPR_VALUE (orte_data_type_t) 36 /**< registry return value */
#define ORTE_GPR_ADDR_MODE (orte_data_type_t) 37 /**< Addressing mode for registry cmds */
#define ORTE_GPR_SUBSCRIPTION (orte_data_type_t) 38 /**< describes data returned by subscription */
#define ORTE_GPR_TRIGGER (orte_data_type_t) 39 /**< describes trigger conditions */
#define ORTE_GPR_NOTIFY_DATA (orte_data_type_t) 40 /**< data returned from a subscription */
#define ORTE_GPR_NOTIFY_MSG (orte_data_type_t) 41 /**< notify message containing notify_data objects */
#define ORTE_GPR_NOTIFY_MSG_TYPE (orte_data_type_t) 42 /**< notify message type (subscription or trigger) */
#define ORTE_GPR_SEARCH (orte_data_type_t) 43 /**< search criteria */
#define ORTE_GPR_UPDATE (orte_data_type_t) 44 /**< update data on the registry */
#define ORTE_GPR_KEYVAL (orte_data_type_t) 33 /**< registry key-value pair */
#define ORTE_GPR_NOTIFY_ACTION (orte_data_type_t) 34 /**< registry notify action */
#define ORTE_GPR_TRIGGER_ACTION (orte_data_type_t) 35 /**< registry trigger action */
#define ORTE_GPR_CMD (orte_data_type_t) 36 /**< registry command */
#define ORTE_GPR_SUBSCRIPTION_ID (orte_data_type_t) 37 /**< registry notify id tag */
#define ORTE_GPR_TRIGGER_ID (orte_data_type_t) 38 /**< registry notify id tag */
#define ORTE_GPR_VALUE (orte_data_type_t) 39 /**< registry return value */
#define ORTE_GPR_ADDR_MODE (orte_data_type_t) 40 /**< Addressing mode for registry cmds */
#define ORTE_GPR_SUBSCRIPTION (orte_data_type_t) 41 /**< describes data returned by subscription */
#define ORTE_GPR_TRIGGER (orte_data_type_t) 42 /**< describes trigger conditions */
#define ORTE_GPR_NOTIFY_DATA (orte_data_type_t) 43 /**< data returned from a subscription */
#define ORTE_GPR_NOTIFY_MSG (orte_data_type_t) 44 /**< notify message containing notify_data objects */
#define ORTE_GPR_NOTIFY_MSG_TYPE (orte_data_type_t) 45 /**< notify message type (subscription or trigger) */
#define ORTE_GPR_SEARCH (orte_data_type_t) 46 /**< search criteria */
#define ORTE_GPR_UPDATE (orte_data_type_t) 47 /**< update data on the registry */
/* Resource Manager types */
#define ORTE_APP_CONTEXT (orte_data_type_t) 45 /**< argv and enviro arrays */
#define ORTE_APP_CONTEXT_MAP (orte_data_type_t) 46 /**< application context mapping array */
#define ORTE_NODE_DESC (orte_data_type_t) 47 /**< describes capabilities of nodes */
#define ORTE_SLOT_DESC (orte_data_type_t) 48 /**< describes slot allocations/reservations */
#define ORTE_RAS_NODE (orte_data_type_t) 49 /**< node information */
#define ORTE_JOB_MAP (orte_data_type_t) 50 /**< map of process locations */
#define ORTE_MAPPED_PROC (orte_data_type_t) 51 /**< process entry on map */
#define ORTE_MAPPED_NODE (orte_data_type_t) 52 /**< node entry on map */
#define ORTE_ATTRIBUTE (orte_data_type_t) 53 /**< attribute used to control framework behavior */
#define ORTE_ATTR_LIST (orte_data_type_t) 54 /**< list of attributes */
#define ORTE_APP_CONTEXT (orte_data_type_t) 48 /**< argv and enviro arrays */
#define ORTE_APP_CONTEXT_MAP (orte_data_type_t) 49 /**< application context mapping array */
#define ORTE_NODE_DESC (orte_data_type_t) 50 /**< describes capabilities of nodes */
#define ORTE_CELL_DESC (orte_data_type_t) 51 /**< describe attributes of cells */
#define ORTE_SLOT_DESC (orte_data_type_t) 52 /**< describes slot allocations/reservations */
#define ORTE_RAS_NODE (orte_data_type_t) 53 /**< node information */
#define ORTE_JOB_MAP (orte_data_type_t) 54 /**< map of process locations */
#define ORTE_MAPPED_PROC (orte_data_type_t) 55 /**< process entry on map */
#define ORTE_MAPPED_NODE (orte_data_type_t) 56 /**< node entry on map */
#define ORTE_ATTRIBUTE (orte_data_type_t) 57 /**< attribute used to control framework behavior */
#define ORTE_ATTR_LIST (orte_data_type_t) 58 /**< list of attributes */
/* RML types */
#define ORTE_RML_TAG (orte_data_type_t) 55 /**< tag for sending/receiving messages */
#define ORTE_RML_TAG (orte_data_type_t) 59 /**< tag for sending/receiving messages */
/* DAEMON communication type */
#define ORTE_DAEMON_CMD (orte_data_type_t) 56 /**< command flag for communicating with the daemon */
#define ORTE_DAEMON_CMD (orte_data_type_t) 60 /**< command flag for communicating with the daemon */
/* Need a command separate from ORTE_DAEMON_CMD, so that we can receive on
* them both at the same time */

Просмотреть файл

@ -42,9 +42,14 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
return;
}
opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_ERROR_NAME(error_code), filename, line);
if (NULL == orte_process_info.my_name) {
opal_output(0, "[NO-NAME] ORTE_ERROR_LOG: %s in file %s at line %d",
ORTE_ERROR_NAME(error_code), filename, line);
} else {
opal_output(0, "[%lu,%lu,%lu] ORTE_ERROR_LOG: %s in file %s at line %d",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_ERROR_NAME(error_code), filename, line);
}
}
int orte_errmgr_base_proc_aborted_not_avail(orte_gpr_notify_message_t *msg)

Просмотреть файл

@ -154,8 +154,8 @@ orte_errmgr_bproc_component_init(bool *allow_multi_user_threads, bool *have_hidd
int orte_errmgr_bproc_finalize(void)
{
if (orte_errmgr_bproc_globals.debug) {
opal_output(0, "%s errmgr_bproc_finalize called",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] errmgr_bproc_finalize called",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
initialized = false;

Просмотреть файл

@ -159,8 +159,8 @@ int orte_errmgr_hnp_finalize(void)
int rc;
if (orte_errmgr_hnp_globals.debug) {
opal_output(0, "%s errmgr_hnp_finalize called",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] errmgr_hnp_finalize called",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* stop the receive function */

Просмотреть файл

@ -154,8 +154,8 @@ orte_errmgr_orted_component_init(bool *allow_multi_user_threads, bool *have_hidd
int orte_errmgr_orted_finalize(void)
{
if (orte_errmgr_orted_globals.debug) {
opal_output(0, "%s errmgr_orted_finalize called",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] errmgr_orted_finalize called",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
initialized = false;

Просмотреть файл

@ -153,8 +153,8 @@ orte_errmgr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidd
int orte_errmgr_proxy_finalize(void)
{
if (orte_errmgr_proxy_globals.debug) {
opal_output(0, "%s errmgr_proxy_finalize called",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] errmgr_proxy_finalize called",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
initialized = false;

Просмотреть файл

@ -155,7 +155,8 @@ int orte_filem_base_get_proc_node_name(orte_process_name_t *proc, char **machine
* Contact GPR and get the 'orte-node-name' for this process
*/
/* if it is the root then we need a different key :/ */
if(proc->jobid == 0 &&
if(proc->cellid == 0 &&
proc->jobid == 0 &&
proc->vpid == 0) {
keys[0] = ORTE_PROC_RML_IP_ADDRESS_KEY;
}
@ -360,9 +361,9 @@ void orte_filem_base_query_callback(int status,
}
opal_output_verbose(10, orte_filem_base_output,
"filem:base: filem_base_query_callback: %s -> %s: Filename Requested (%s) translated to (%s)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(peer),
"filem:base: filem_base_query_callback: [%lu,%lu,%lu] -> [%lu,%lu,%lu]: Filename Requested (%s) translated to (%s)",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(peer),
filename, tmp_name);
/*

Просмотреть файл

@ -426,9 +426,9 @@ static void orte_filem_rsh_query_callback(int status,
void* cbdata)
{
opal_output_verbose(10, mca_filem_rsh_component.super.output_handle,
"filem:rsh: query_callback(%s -> %s)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(peer));
"filem:rsh: query_callback([%lu,%lu,%lu] -> [%lu,%lu,%lu])",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(peer));
/* Call the base callback function */
orte_filem_base_query_callback(status, peer, buffer, tag, cbdata);

Просмотреть файл

@ -229,8 +229,8 @@ orte_gpr_proxy_component_init(bool *allow_multi_user_threads, bool *have_hidden_
if (NULL != orte_process_info.gpr_replica_uri) {
if (orte_gpr_proxy_globals.debug) {
opal_output(0, "%s gpr_proxy_init: proxy selected",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_proxy_init: proxy selected",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* setup the replica location */
@ -325,8 +325,8 @@ int orte_gpr_proxy_finalize(void)
orte_gpr_proxy_trigger_t **ltrigs;
if (orte_gpr_proxy_globals.debug) {
opal_output(0, "%s gpr_proxy_finalize called",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_proxy_finalize called",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
if (initialized) {

Просмотреть файл

@ -95,7 +95,7 @@ int orte_gpr_proxy_exec_compound_cmd(orte_buffer_t *buffer)
int rc, response;
if (orte_gpr_proxy_globals.debug) {
opal_output(0, "[%ld,%ld] transmitting compound command",
opal_output(0, "[%lu,%lu,%lu] transmitting compound command",
ORTE_NAME_ARGS(orte_process_info.my_name));
}

Просмотреть файл

@ -47,7 +47,7 @@ int orte_gpr_proxy_dump_local_triggers(void)
orte_gpr_proxy_trigger_t **trigs;
orte_std_cntr_t j, k;
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for [%ld,%ld]\n",
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output(orte_gpr_base_output, "Number of triggers: %lu\n", (unsigned long) orte_gpr_proxy_globals.num_trigs);
@ -72,7 +72,7 @@ int orte_gpr_proxy_dump_local_subscriptions(void)
orte_gpr_proxy_subscriber_t **subs;
orte_std_cntr_t j, k;
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for [%ld,%ld]\n",
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output(orte_gpr_base_output, "Number of subscriptions: %lu\n", (unsigned long) orte_gpr_proxy_globals.num_subs);

Просмотреть файл

@ -42,8 +42,8 @@ int orte_gpr_replica_dump_all(void)
int rc;
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_dump_all: entered",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_all: entered",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
@ -74,8 +74,8 @@ int orte_gpr_replica_dump_segments(char *segment)
int rc;
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_dump_segments: entered",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_segments: entered",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
@ -106,8 +106,8 @@ int orte_gpr_replica_dump_triggers(orte_gpr_trigger_id_t start)
int rc;
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_dump_triggers: entered",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_triggers: entered",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
@ -290,8 +290,8 @@ int orte_gpr_replica_dump_callbacks(void)
int rc;
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_dump_callbacks: entered",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_dump_callbacks: entered",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);

Просмотреть файл

@ -40,8 +40,8 @@ int orte_gpr_replica_dump_local_triggers(void)
orte_gpr_replica_local_trigger_t **trigs;
orte_std_cntr_t j, k;
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for %s\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(orte_gpr_base_output, "DUMP OF LOCAL TRIGGERS for [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output(orte_gpr_base_output, "Number of triggers: %lu\n", (unsigned long) orte_gpr_replica_globals.num_local_trigs);
trigs = (orte_gpr_replica_local_trigger_t**)(orte_gpr_replica_globals.local_triggers)->addr;
@ -70,8 +70,8 @@ int orte_gpr_replica_dump_local_subscriptions(void)
orte_gpr_replica_local_subscriber_t **subs;
orte_std_cntr_t j, k;
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for %s\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(orte_gpr_base_output, "DUMP OF LOCAL SUBSCRIPTIONS for [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output(orte_gpr_base_output, "Number of subscriptions: %lu\n", (unsigned long) orte_gpr_replica_globals.num_local_subs);
subs = (orte_gpr_replica_local_subscriber_t**)(orte_gpr_replica_globals.local_subscriptions)->addr;

Просмотреть файл

@ -64,8 +64,8 @@ void orte_gpr_replica_recv(int status, orte_process_name_t* sender,
OPAL_TRACE(3);
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr replica: received message from %s",
ORTE_NAME_PRINT(orte_process_info.my_name), ORTE_NAME_PRINT(sender));
opal_output(0, "[%lu,%lu,%lu] gpr replica: received message from [%lu,%lu,%lu]",
ORTE_NAME_ARGS(orte_process_info.my_name), ORTE_NAME_ARGS(sender));
}
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);

Просмотреть файл

@ -96,7 +96,7 @@ int orte_gpr_replica_remote_notify(orte_process_name_t *recipient,
* process is done
*/
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
opal_output(0, "send failed to %s", ORTE_NAME_PRINT(recipient));
opal_output(0, "send failed to [%ld,%ld,%ld]", ORTE_NAME_ARGS(recipient));
orte_dss.dump(0, message, ORTE_GPR_NOTIFY_MSG);
OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
return ORTE_ERR_COMM_FAILURE;

Просмотреть файл

@ -79,8 +79,8 @@ int orte_gpr_replica_cleanup_proc_fn(orte_process_name_t *proc)
OPAL_TRACE(2);
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_cleanup_proc: function entered for process %s",
ORTE_NAME_PRINT(orte_process_info.my_name), ORTE_NAME_PRINT(proc));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_cleanup_proc: function entered for process [%lu,%lu,%lu]",
ORTE_NAME_ARGS(orte_process_info.my_name), ORTE_NAME_ARGS(proc));
}
if (ORTE_SUCCESS != (rc = orte_ns.get_proc_name_string(&procname, proc))) {

Просмотреть файл

@ -236,8 +236,8 @@ int orte_gpr_replica_dump_callbacks_fn(orte_buffer_t *buffer)
if (NULL == cb->requestor) {
sprintf(tmp_out, "Local requestor");
} else {
sprintf(tmp_out, "Requestor: %s",
ORTE_NAME_PRINT(cb->requestor));
sprintf(tmp_out, "Requestor: [%lu,%lu,%lu]",
ORTE_NAME_ARGS(cb->requestor));
}
orte_gpr_replica_dump_load_string(buffer, &tmp_out);
orte_gpr_base_dump_notify_msg(buffer, cb->message);
@ -420,8 +420,8 @@ int orte_gpr_replica_dump_trigger(orte_buffer_t *buffer,
sprintf(tmp_out, "\t\tRequestor %lu: LOCAL@idtag %lu",
(unsigned long)j, (unsigned long)attached[i]->idtag);
} else {
sprintf(tmp_out, "\t\tRequestor %lu: %s@idtag %lu",
(unsigned long)j, ORTE_NAME_PRINT(attached[i]->requestor),
sprintf(tmp_out, "\t\tRequestor %lu: [%lu,%lu,%lu]@idtag %lu",
(unsigned long)j, ORTE_NAME_ARGS(attached[i]->requestor),
(unsigned long)attached[i]->idtag);
}
orte_gpr_replica_dump_load_string(buffer, &tmp_out);
@ -435,8 +435,8 @@ int orte_gpr_replica_dump_trigger(orte_buffer_t *buffer,
sprintf(tmp_out, "\tTRIGGER MASTER: LOCAL@idtag %lu",
(unsigned long)trig->master->idtag);
} else {
sprintf(tmp_out, "\tTRIGGER MASTER: %s@idtag %lu",
ORTE_NAME_PRINT(trig->master->requestor),
sprintf(tmp_out, "\tTRIGGER MASTER: [%lu,%lu,%lu]@idtag %lu",
ORTE_NAME_ARGS(trig->master->requestor),
(unsigned long)trig->master->idtag);
}
}
@ -612,8 +612,8 @@ int orte_gpr_replica_dump_subscription(orte_buffer_t *buffer,
sprintf(tmp_out, "\t\tRequestor: LOCAL @ subscription id %lu",
(unsigned long) reqs[j]->idtag);
} else {
sprintf(tmp_out, "\t\tRequestor: %s @ subscription id %lu",
ORTE_NAME_PRINT(reqs[j]->requestor),
sprintf(tmp_out, "\t\tRequestor: [%lu,%lu,%lu] @ subscription id %lu",
ORTE_NAME_ARGS(reqs[j]->requestor),
(unsigned long) reqs[j]->idtag);
}
orte_gpr_replica_dump_load_string(buffer, &tmp_out);

Просмотреть файл

@ -130,8 +130,8 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
if (orte_gpr_replica_globals.debug) {
char *tmp;
opal_output(0, "%s gpr_replica_put: entered on segment %s\nValues:",
ORTE_NAME_PRINT(orte_process_info.my_name), seg->name);
opal_output(0, "[%lu,%lu,%lu] gpr_replica_put: entered on segment %s\nValues:",
ORTE_NAME_ARGS(orte_process_info.my_name), seg->name);
for (i=0; i < cnt; i++) {
opal_output(0, "\tKey: %s", keyvals[i]->key);
}
@ -282,7 +282,7 @@ int orte_gpr_replica_put_fn(orte_gpr_addr_mode_t addr_mode,
}
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_put: complete", ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_put: complete", ORTE_NAME_ARGS(orte_process_info.my_name));
}
return ORTE_SUCCESS;
@ -322,8 +322,8 @@ int orte_gpr_replica_get_fn(orte_gpr_addr_mode_t addr_mode,
if (orte_gpr_replica_globals.debug) {
char *token;
opal_output(0, "%s gpr_replica_get: entered",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_get: entered",
ORTE_NAME_ARGS(orte_process_info.my_name));
opal_output(0, "\tGetting data from segment %s with %d tokens and %d keys",
seg->name, num_tokens, num_keys);
for (i=0; i < num_tokens; i++) {
@ -520,8 +520,8 @@ CLEANUP:
OBJ_DESTRUCT(&get_list);
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_get: finished search",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_get: finished search",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
return rc;
@ -721,8 +721,8 @@ CLEANUP:
OBJ_DESTRUCT(&get_list);
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_get: finished search",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] gpr_replica_get: finished search",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
return rc;

Просмотреть файл

@ -52,8 +52,8 @@ int orte_gpr_replica_subscribe_fn(orte_process_name_t *requestor,
OPAL_TRACE(2);
if (orte_gpr_replica_globals.debug) {
opal_output(0, "%s gpr_replica_subscribe: entered with num_trigs:%d",
ORTE_NAME_PRINT(orte_process_info.my_name), num_trigs);
opal_output(0, "[%lu,%lu,%lu] gpr_replica_subscribe: entered with num_trigs:%d",
ORTE_NAME_ARGS(orte_process_info.my_name), num_trigs);
}
/* ensure one of the search arrays is clear - in this case, we

Просмотреть файл

@ -74,7 +74,7 @@ in the job. Note that the jobid=1 and the mask=2. So, we expect this
to collect the stdout from any of the ranks. Obviously the second
subscriber says the same thing but for stderr. The third subscriber
is for receiving data from stdin and sending it out to rank 0 of
the job. Notice the mask=ff which means compare jobid,vpid
the job. Notice the mask=ff which means compare cellid,jobid,vpid
when addressing where the data goes.
The first endpoint is created by a call to pull by the rmgr. After
@ -90,6 +90,7 @@ tied to the subscription. Hmmm, this I do not really understand.
APPENDIX A
These are the defines that go with the mask.
#define ORTE_NS_CMP_NONE 0x00
#define ORTE_NS_CMP_CELLID 0x01
#define ORTE_NS_CMP_JOBID 0x02
#define ORTE_NS_CMP_VPID 0x04
#define ORTE_NS_CMP_ALL 0Xff

Просмотреть файл

@ -127,7 +127,7 @@ int orte_iof_proxy_unpublish(
#if 0
{
int i = 0;
opal_output(orte_iof_base.iof_output, "%s orted: ******** ABOUT TO IOF PROXY UNPUBLISH, %d", ORTE_NAME_PRINT(orte_process_info.my_name), getpid());
opal_output(orte_iof_base.iof_output, "[%lu,%lu,%lu] orted: ******** ABOUT TO IOF PROXY UNPUBLISH, %d", ORTE_NAME_ARGS(orte_process_info.my_name), getpid());
fflush(stderr);
while (0 == i) sleep(5);
}

Просмотреть файл

@ -114,8 +114,8 @@ orte_iof_svc_exception_handler(const orte_process_name_t* peer, orte_rml_excepti
{
orte_iof_base_endpoint_t *endpoint;
opal_output(orte_iof_base.iof_output,
"iof svc exception handler! %s\n",
ORTE_NAME_PRINT((orte_process_name_t*)peer));
"iof svc exception handler! [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(peer));
/* If we detect an exception on the RML connection to a peer,
delete all of its subscriptions and publications. Note that

Просмотреть файл

@ -174,10 +174,10 @@ static void orte_iof_svc_proxy_msg(
/* if the subscription origin doesn't match the message's
origin, skip this subscription */
if(orte_ns.compare_fields(sub->origin_mask,&sub->origin_name,&hdr->msg_origin) == 0) {
opal_output(orte_iof_base.iof_output, "sub origin %s, msg origin %s, msg proxy %s orte_iof_svc_proxy_msg: tag %d sequence %d, len %d\n",
ORTE_NAME_PRINT(&sub->origin_name),
ORTE_NAME_PRINT(&hdr->msg_origin),
ORTE_NAME_PRINT(&hdr->msg_proxy),
opal_output(orte_iof_base.iof_output, "sub origin [%lu,%lu,%lu], msg origin [%lu,%lu,%lu], msg proxy [%lu,%lu,%lu] orte_iof_svc_proxy_msg: tag %d sequence %d, len %d\n",
ORTE_NAME_ARGS(&sub->origin_name),
ORTE_NAME_ARGS(&hdr->msg_origin),
ORTE_NAME_ARGS(&hdr->msg_proxy),
hdr->msg_tag, hdr->msg_seq, hdr->msg_len);
/* Everything matched -- forward the message */
OPAL_THREAD_UNLOCK(&mca_iof_svc_component.svc_lock);
@ -239,10 +239,10 @@ static void orte_iof_svc_proxy_pub(
orte_iof_base_pub_header_t* hdr)
{
int rc;
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_pub: mask %d, tag %d, proc %s, proxy %s",
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_pub: mask %d, tag %d, proc [%lu,%lu,%lu], proxy [%lu,%lu,%lu]",
hdr->pub_mask, hdr->pub_tag,
ORTE_NAME_PRINT(&hdr->pub_name),
ORTE_NAME_PRINT(&hdr->pub_proxy));
ORTE_NAME_ARGS(&hdr->pub_name),
ORTE_NAME_ARGS(&hdr->pub_proxy));
rc = orte_iof_svc_pub_create(
&hdr->pub_name,
@ -264,10 +264,10 @@ static void orte_iof_svc_proxy_unpub(
orte_iof_base_pub_header_t* hdr)
{
int rc;
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_unpub: mask %d, tag %d, proc %s, proxy %s",
opal_output(orte_iof_base.iof_output, "orte_iof_svc_proxy_unpub: mask %d, tag %d, proc [%lu,%lu,%lu], proxy [%lu,%lu,%lu]",
hdr->pub_mask, hdr->pub_tag,
ORTE_NAME_PRINT(&hdr->pub_name),
ORTE_NAME_PRINT(&hdr->pub_proxy));
ORTE_NAME_ARGS(&hdr->pub_name),
ORTE_NAME_ARGS(&hdr->pub_proxy));
rc = orte_iof_svc_pub_delete(
&hdr->pub_name,

Просмотреть файл

@ -54,8 +54,8 @@ int orte_iof_svc_pub_create(
pub->pub_tag = pub_tag;
pub->pub_endpoint =
orte_iof_base_endpoint_match(pub_name,pub_mask,pub_tag);
opal_output(orte_iof_base.iof_output, "created svc pub, name %s, proxy %s, tag %d / mask %x, endpoint %p\n",
ORTE_NAME_PRINT((orte_process_name_t*)pub_name), ORTE_NAME_PRINT((orte_process_name_t*)pub_proxy),
opal_output(orte_iof_base.iof_output, "created svc pub, name [%lu,%lu,%lu], proxy [%lu,%lu,%lu], tag %d / mask %x, endpoint %p\n",
ORTE_NAME_ARGS(pub_name), ORTE_NAME_ARGS(pub_proxy),
pub_tag, pub_mask, (char*) pub->pub_endpoint);
/* append this published endpoint to any matching subscription */

Просмотреть файл

@ -104,9 +104,9 @@ int orte_iof_svc_sub_create(
sub->target_mask = target_mask;
sub->target_tag = target_tag;
sub->sub_endpoint = orte_iof_base_endpoint_match(&sub->target_name, sub->target_mask, sub->target_tag);
opal_output(orte_iof_base.iof_output, "created svc sub, origin %s tag %d / mask %x, target %s, tag %d / mask %x\n",
ORTE_NAME_PRINT((orte_process_name_t*)origin_name), origin_tag, origin_mask,
ORTE_NAME_PRINT((orte_process_name_t*)target_name), target_tag, target_mask);
opal_output(orte_iof_base.iof_output, "created svc sub, origin [%lu,%lu,%lu] tag %d / mask %x, target [%lu,%lu,%lu], tag %d / mask %x\n",
ORTE_NAME_ARGS(origin_name), origin_tag, origin_mask,
ORTE_NAME_ARGS(target_name), target_tag, target_mask);
/* search through published endpoints for a match */
for(item = opal_list_get_first(&mca_iof_svc_component.svc_published);
@ -191,9 +191,9 @@ void orte_iof_svc_sub_ack(
orte_iof_svc_sub_t* sub = (orte_iof_svc_sub_t*)s_item;
opal_list_item_t *f_item;
opal_output(orte_iof_base.iof_output, "ack: checking sub origin %s tag %d / mask %x, target %s, tag %d / mask %x\n",
ORTE_NAME_PRINT(&sub->origin_name), sub->origin_tag, sub->origin_mask,
ORTE_NAME_PRINT(&sub->target_name), sub->target_tag, sub->target_mask);
opal_output(orte_iof_base.iof_output, "ack: checking sub origin [%lu,%lu,%lu] tag %d / mask %x, target [%lu,%lu,%lu], tag %d / mask %x\n",
ORTE_NAME_ARGS(&sub->origin_name), sub->origin_tag, sub->origin_mask,
ORTE_NAME_ARGS(&sub->target_name), sub->target_tag, sub->target_mask);
/* If the subscription origin/tag doesn't match the ACK
origin/tag, skip it */
@ -223,8 +223,8 @@ void orte_iof_svc_sub_ack(
orte_iof_svc_pub_t* pub = fwd->fwd_pub;
bool value_set = true;
opal_output(orte_iof_base.iof_output, "ack: checking fwd %s tag %d / mask %x\n",
ORTE_NAME_PRINT(&pub->pub_name), pub->pub_tag, pub->pub_mask);
opal_output(orte_iof_base.iof_output, "ack: checking fwd [%lu,%lu,%lu] tag %d / mask %x\n",
ORTE_NAME_ARGS(&pub->pub_name), pub->pub_tag, pub->pub_mask);
/* If the publication origin or publication proxy matches
the ACK'ing proxy, save the ACK'ed byte count for this
@ -521,12 +521,12 @@ int orte_iof_svc_fwd_create(
}
OBJ_RETAIN(pub);
fwd->fwd_pub = pub;
opal_output(orte_iof_base.iof_output, "created svc forward, sub origin %s, tag %d / mask %x, sub target %s, tag %d / mask %x :::: pub name %s, tag %d / mask %x\n",
ORTE_NAME_PRINT(&sub->origin_name), sub->origin_tag,
opal_output(orte_iof_base.iof_output, "created svc forward, sub origin [%lu,%lu,%lu], tag %d / mask %x, sub target [%lu,%lu,%lu], tag %d / mask %x :::: pub name [%lu,%lu,%lu], tag %d / mask %x\n",
ORTE_NAME_ARGS(&sub->origin_name), sub->origin_tag,
sub->origin_mask,
ORTE_NAME_PRINT(&sub->target_name), sub->target_tag,
ORTE_NAME_ARGS(&sub->target_name), sub->target_tag,
sub->target_mask,
ORTE_NAME_PRINT(&pub->pub_name), pub->pub_tag, pub->pub_mask);
ORTE_NAME_ARGS(&pub->pub_name), pub->pub_tag, pub->pub_mask);
opal_list_append(&sub->sub_forward, &fwd->super);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -24,11 +24,10 @@ libmca_ns_la_SOURCES += \
base/ns_base_close.c \
base/ns_base_select.c \
base/ns_base_open.c \
base/ns_base_node_fns.c \
base/ns_base_cell_fns.c \
base/ns_base_job_fns.c \
base/ns_base_vpid_name_fns.c \
base/ns_base_general_fns.c \
base/ns_base_print_name_args.c \
base/ns_base_diag_fns.c \
base/data_type_support/ns_data_type_compare_fns.c \
base/data_type_support/ns_data_type_copy_fns.c \

Просмотреть файл

@ -49,7 +49,6 @@ extern "C" {
ORTE_DECLSPEC int orte_ns_base_open(void);
ORTE_DECLSPEC int orte_ns_base_select(void);
ORTE_DECLSPEC int orte_ns_base_close(void);
ORTE_DECLSPEC int orte_ns_base_init_print_args(void);
/*
* globals that might be needed

Просмотреть файл

@ -49,6 +49,18 @@ int orte_ns_base_compare_name(orte_process_name_t *value1,
* value - a totally useless result, but consistent in behavior.
*/
/** check the cellids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
if (value1->cellid != ORTE_CELLID_WILDCARD &&
value2->cellid != ORTE_CELLID_WILDCARD) {
if (value1->cellid < value2->cellid) {
return ORTE_VALUE2_GREATER;
} else if (value1->cellid > value2->cellid) {
return ORTE_VALUE1_GREATER;
}
}
/** check the jobids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
@ -108,6 +120,21 @@ int orte_ns_base_compare_jobid(orte_jobid_t *value1,
return ORTE_EQUAL;
}
/*
 * Three-way comparison of two cellids.
 *
 * value1, value2: pointers to the cellids being compared.
 * type:           not consulted by this function.
 *
 * Returns ORTE_EQUAL when either side is ORTE_CELLID_WILDCARD or when both
 * values match; otherwise ORTE_VALUE1_GREATER or ORTE_VALUE2_GREATER
 * depending on which cellid is numerically larger.
 */
int orte_ns_base_compare_cellid(orte_cellid_t *value1,
                                orte_cellid_t *value2,
                                orte_data_type_t type)
{
    const orte_cellid_t lhs = *value1;
    const orte_cellid_t rhs = *value2;

    /* a wildcard on either side matches anything */
    if (ORTE_CELLID_WILDCARD == lhs || ORTE_CELLID_WILDCARD == rhs) {
        return ORTE_EQUAL;
    }

    if (lhs > rhs) {
        return ORTE_VALUE1_GREATER;
    } else if (rhs > lhs) {
        return ORTE_VALUE2_GREATER;
    }

    return ORTE_EQUAL;
}
int orte_ns_base_compare_nodeid(orte_nodeid_t *value1,
orte_nodeid_t *value2,
orte_data_type_t type)

Просмотреть файл

@ -44,6 +44,25 @@ int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_
return ORTE_SUCCESS;
}
/*
* CELLID
*/
/*
 * Duplicate a single cellid onto the heap.
 *
 * dest: receives a pointer to the newly malloc'd copy (written only on
 *       success).
 * src:  cellid to copy.
 * type: not consulted by this function.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE (after logging) when
 * the allocation fails.
 */
int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type)
{
    orte_cellid_t *copy = (orte_cellid_t*)malloc(sizeof(*copy));

    if (NULL != copy) {
        *copy = *src;
        *dest = copy;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
* NODEID
*/
@ -95,6 +114,7 @@ int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src,
return ORTE_ERR_OUT_OF_RESOURCE;
}
val->cellid = src->cellid;
val->jobid = src->jobid;
val->vpid = src->vpid;

Просмотреть файл

@ -39,9 +39,30 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, const void *src,
int rc;
orte_std_cntr_t i;
orte_process_name_t* proc;
orte_cellid_t *cellid;
orte_jobid_t *jobid;
orte_vpid_t *vpid;
/* collect all the cellids in a contiguous array */
cellid = (orte_cellid_t*)malloc(num_vals * sizeof(orte_cellid_t));
if (NULL == cellid) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
proc = (orte_process_name_t*)src;
for (i=0; i < num_vals; i++) {
cellid[i] = proc->cellid;
proc++;
}
/* now pack them in one shot */
if (ORTE_SUCCESS != (rc =
orte_ns_base_pack_cellid(buffer, cellid, num_vals, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
free(cellid);
return rc;
}
free(cellid);
/* collect all the jobids in a contiguous array */
jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t));
if (NULL == jobid) {
@ -85,6 +106,23 @@ int orte_ns_base_pack_name(orte_buffer_t *buffer, const void *src,
return ORTE_SUCCESS;
}
/*
* CELLID
*/
/*
 * Pack num_vals cellids from src into buffer.
 *
 * The work is delegated to orte_dss_pack_buffer() using ORTE_CELLID_T as
 * the data type; the `type` argument is not consulted. Any non-success
 * code from the packer is logged and passed straight back to the caller.
 */
int orte_ns_base_pack_cellid(orte_buffer_t *buffer, const void *src,
                             orte_std_cntr_t num_vals, orte_data_type_t type)
{
    int rc = orte_dss_pack_buffer(buffer, src, num_vals, ORTE_CELLID_T);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}
/*
* NODEID
*/

Просмотреть файл

@ -45,6 +45,10 @@ int orte_ns_base_std_print(char **output, char *prefix, void *src, orte_data_typ
orte_ns_base_quick_print(output, "ORTE_JOBID", prefix, src, sizeof(orte_jobid_t));
break;
case ORTE_CELLID:
orte_ns_base_quick_print(output, "ORTE_CELLID", prefix, src, sizeof(orte_cellid_t));
break;
case ORTE_NODEID:
orte_ns_base_quick_print(output, "ORTE_NODEID", prefix, src, sizeof(orte_nodeid_t));
break;
@ -69,8 +73,8 @@ int orte_ns_base_print_name(char **output, char *prefix, orte_process_name_t *na
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: NULL",
(NULL == prefix ? " " : prefix));
} else {
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%ld,%ld]",
(NULL == prefix ? " " : prefix),
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: [%ld,%ld,%ld]",
(NULL == prefix ? " " : prefix), (long)name->cellid,
(long)name->jobid, (long)name->vpid);
}

Просмотреть файл

@ -40,6 +40,10 @@ int orte_ns_base_std_size(size_t *size, void *src, orte_data_type_t type)
*size = sizeof(orte_jobid_t);
break;
case ORTE_CELLID:
*size = sizeof(orte_cellid_t);
break;
case ORTE_NODEID:
*size = sizeof(orte_nodeid_t);
break;

Просмотреть файл

@ -37,16 +37,34 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
int rc;
orte_std_cntr_t i, num;
orte_process_name_t* proc;
orte_cellid_t *cellid;
orte_jobid_t *jobid;
orte_vpid_t *vpid;
num = *num_vals;
/* allocate space for all the cellids in a contiguous array */
cellid = (orte_cellid_t*)malloc(num * sizeof(orte_cellid_t));
if (NULL == cellid) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*num_vals = 0;
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* now unpack them in one shot */
if (ORTE_SUCCESS != (rc =
orte_ns_base_unpack_cellid(buffer, cellid, num_vals, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
*num_vals = 0;
free(cellid);
return rc;
}
/* allocate space for all the jobids in a contiguous array */
jobid = (orte_jobid_t*)malloc(num * sizeof(orte_jobid_t));
if (NULL == jobid) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*num_vals = 0;
free(cellid);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* now unpack them in one shot */
@ -55,6 +73,7 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
*num_vals = 0;
free(jobid);
free(cellid);
return rc;
}
@ -64,6 +83,7 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
*num_vals = 0;
free(jobid);
free(cellid);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* now unpack them in one shot */
@ -73,12 +93,14 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
*num_vals = 0;
free(vpid);
free(jobid);
free(cellid);
return rc;
}
/* build the names from the jobid/vpid arrays */
/* build the names from the cellid/jobid/vpid arrays */
proc = (orte_process_name_t*)dest;
for (i=0; i < num; i++) {
proc->cellid = cellid[i];
proc->jobid = jobid[i];
proc->vpid = vpid[i];
proc++;
@ -87,10 +109,27 @@ int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
/* cleanup */
free(vpid);
free(jobid);
free(cellid);
return ORTE_SUCCESS;
}
/*
* CELLID
*/
/*
 * Unpack cellids from buffer into dest.
 *
 * The work is delegated to orte_dss_unpack_buffer() using ORTE_CELLID_T as
 * the data type; the `type` argument is not consulted. num_vals is handed
 * through to the unpacker unchanged. Any non-success code is logged and
 * passed straight back to the caller.
 */
int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
                               orte_std_cntr_t *num_vals, orte_data_type_t type)
{
    int rc = orte_dss_unpack_buffer(buffer, dest, num_vals, ORTE_CELLID_T);

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}
/*
* NODEID
*/

Просмотреть файл

@ -44,7 +44,25 @@
* "not available" functions
*/
int
orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodename)
orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid, char *site, char *resource)
{
*cellid = ORTE_CELLID_INVALID;
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
/*
 * "Not available" stand-in for the get_cell_info entry point.
 *
 * Clears both output strings, logs ORTE_ERR_UNREACH and returns it.
 * The cellid argument is not consulted.
 */
int
orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
                                         char **site, char **resource)
{
    const int rc = ORTE_ERR_UNREACH;

    /* never hand back stale pointers on the failure path */
    *site = NULL;
    *resource = NULL;

    ORTE_ERROR_LOG(rc);
    return rc;
}
int
orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, orte_cellid_t cellid, char **nodename)
{
*nodeids = NULL;
*nnodes = 0;
@ -53,7 +71,8 @@ orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr
}
int
orte_ns_base_get_node_info_not_available(char ***nodenames, orte_std_cntr_t num_nodeids, orte_nodeid_t *nodeids)
orte_ns_base_get_node_info_not_available(char ***nodenames, orte_cellid_t cellid,
orte_std_cntr_t num_nodeids, orte_nodeid_t *nodeids)
{
*nodenames = NULL;
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
@ -61,6 +80,95 @@ orte_ns_base_get_node_info_not_available(char ***nodenames, orte_std_cntr_t num_
}
/**** CELL STRING FUNCTIONS ****/
/*
 * Produce a newly allocated string form of the cellid of a process name.
 *
 * cellid_string: receives the allocated string; set to NULL on every
 *                failure path. The caller owns (and must free) the result.
 * name:          process name whose cellid is converted; must not be NULL.
 *
 * ORTE_CELLID_WILDCARD and ORTE_CELLID_INVALID map to
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING respectively;
 * any other value is printed as a decimal long.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM when name is NULL, or
 * ORTE_ERR_OUT_OF_RESOURCE when the string cannot be allocated.
 */
int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name)
{
    const char *reserved = NULL;

    if (NULL == name) { /* got an error */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid_string = NULL;
        return ORTE_ERR_BAD_PARAM;
    }

    /* check for the reserved wildcard / invalid values first */
    if (ORTE_CELLID_WILDCARD == name->cellid) {
        reserved = ORTE_SCHEMA_WILDCARD_STRING;
    } else if (ORTE_CELLID_INVALID == name->cellid) {
        reserved = ORTE_SCHEMA_INVALID_STRING;
    }

    if (NULL != reserved) {
        /* strdup can fail; do not report success with a NULL string */
        *cellid_string = strdup(reserved);
        if (NULL == *cellid_string) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        return ORTE_SUCCESS;
    }

    if (0 > asprintf(cellid_string, "%ld", (long) name->cellid)) {
        /* the output pointer is not guaranteed to be usable after a failed asprintf */
        *cellid_string = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Produce a newly allocated string form of a cellid.
 *
 * cellid_string: receives the allocated string; set to NULL when the
 *                allocation fails. The caller owns (and must free) the
 *                result.
 * cellid:        value to convert.
 *
 * ORTE_CELLID_WILDCARD and ORTE_CELLID_INVALID map to
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING respectively;
 * any other value is printed as a decimal long.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE when the string cannot
 * be allocated.
 */
int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid)
{
    const char *reserved = NULL;

    /* check for the reserved wildcard / invalid values first */
    if (ORTE_CELLID_WILDCARD == cellid) {
        reserved = ORTE_SCHEMA_WILDCARD_STRING;
    } else if (ORTE_CELLID_INVALID == cellid) {
        reserved = ORTE_SCHEMA_INVALID_STRING;
    }

    if (NULL != reserved) {
        /* strdup can fail; do not report success with a NULL string */
        *cellid_string = strdup(reserved);
        if (NULL == *cellid_string) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        return ORTE_SUCCESS;
    }

    if (0 > asprintf(cellid_string, "%ld", (long) cellid)) {
        /* the output pointer is not guaranteed to be usable after a failed asprintf */
        *cellid_string = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Parse the string form of a cellid.
 *
 * cellid:       receives the parsed value; set to ORTE_CELLID_INVALID on
 *               every error path.
 * cellidstring: text to parse; must not be NULL.
 *
 * ORTE_SCHEMA_WILDCARD_STRING and ORTE_SCHEMA_INVALID_STRING are mapped to
 * ORTE_CELLID_WILDCARD and ORTE_CELLID_INVALID (both reported as success).
 * Anything else must be a complete base-10 integer within
 * [ORTE_CELLID_MIN, ORTE_CELLID_MAX].
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM for a NULL, non-numeric,
 * partially numeric or out-of-range string.
 */
int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring)
{
    long int tmpint;
    char *endptr = NULL;

    if (NULL == cellidstring) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    /** check for wildcard string - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_WILDCARD_STRING, cellidstring)) {
        *cellid = ORTE_CELLID_WILDCARD;
        return ORTE_SUCCESS;
    }

    /** check for invalid string - handle appropriately */
    if (0 == strcmp(ORTE_SCHEMA_INVALID_STRING, cellidstring)) {
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_SUCCESS;
    }

    tmpint = strtol(cellidstring, &endptr, 10);

    /* endptr == cellidstring means no digits were consumed (strtol then
     * returns 0, which would otherwise be mistaken for cell 0); a non-NUL
     * *endptr means trailing characters that are not part of the number.
     */
    if (endptr != cellidstring && '\0' == *endptr &&
        ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
        *cellid = (orte_cellid_t)tmpint;
    } else {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        *cellid = ORTE_CELLID_INVALID;
        return ORTE_ERR_BAD_PARAM;
    }

    return ORTE_SUCCESS;
}
/**** NODEID STRING FUNCTIONS ****/
int orte_ns_base_convert_nodeid_to_string(char **string, const orte_nodeid_t nodeid)
{

Просмотреть файл

@ -45,6 +45,13 @@
* "not available" functions
*/
/*
 * "Not available" stand-in for the dump_cells entry point: logs
 * ORTE_ERR_UNREACH and returns it without doing anything else.
 */
int
orte_ns_base_dump_cells_not_available(void)
{
    const int rc = ORTE_ERR_UNREACH;

    ORTE_ERROR_LOG(rc);
    return rc;
}
int
orte_ns_base_dump_jobs_not_available(void)
{

Просмотреть файл

@ -44,9 +44,9 @@
* globals
*/
orte_process_name_t orte_ns_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
orte_process_name_t orte_ns_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
orte_process_name_t orte_ns_name_my_hnp = {0, 0};
orte_process_name_t orte_ns_name_wildcard = {ORTE_CELLID_WILDCARD, ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD};
orte_process_name_t orte_ns_name_invalid = {ORTE_CELLID_INVALID, ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
orte_process_name_t orte_ns_name_my_hnp = {0, 0, 0};
/*
* Global variables
@ -55,6 +55,12 @@ int mca_ns_base_output = -1;
mca_ns_base_module_t orte_ns = {
/* init */
orte_ns_base_module_init_not_available,
/* cell functions */
orte_ns_base_create_cellid_not_available,
orte_ns_base_get_cell_info_not_available,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/* node functions */
orte_ns_base_create_nodeids_not_available,
orte_ns_base_get_node_info_not_available,
@ -89,6 +95,7 @@ mca_ns_base_module_t orte_ns = {
/* data type functions */
orte_ns_base_define_data_type_not_available,
/* diagnostic functions */
orte_ns_base_dump_cells_not_available,
orte_ns_base_dump_jobs_not_available,
orte_ns_base_dump_tags_not_available,
orte_ns_base_dump_datatypes_not_available,
@ -150,12 +157,6 @@ int orte_ns_base_open(void)
}
mca_ns_base_output = opal_output_open(&kill_prefix);
/* setup the print_args function */
if (ORTE_SUCCESS != (rc = orte_ns_base_init_print_args())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* register the base system types with the DSS */
tmp = ORTE_NAME;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_name,
@ -199,7 +200,21 @@ int orte_ns_base_open(void)
return rc;
}
/* Open up all available components */
tmp = ORTE_CELLID;
if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ns_base_pack_cellid,
orte_ns_base_unpack_cellid,
(orte_dss_copy_fn_t)orte_ns_base_copy_cellid,
(orte_dss_compare_fn_t)orte_ns_base_compare_cellid,
(orte_dss_size_fn_t)orte_ns_base_std_size,
(orte_dss_print_fn_t)orte_ns_base_std_print,
(orte_dss_release_fn_t)orte_ns_base_std_release,
ORTE_DSS_UNSTRUCTURED,
"ORTE_CELLID", &tmp))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* Open up all available components */
if (ORTE_SUCCESS !=
mca_base_components_open("ns", mca_ns_base_output,

Просмотреть файл

@ -1,82 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <stdio.h>
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/printf.h"
#include "opal/threads/tsd.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns_types.h"
#include "orte/mca/ns/base/base.h"
/* Size limit passed to snprintf() when formatting a name; the per-thread
 * buffer is allocated with one extra byte beyond this. */
#define ORTE_PRINT_NAME_ARGS_MAX_SIZE 20
/* Thread-specific-data key under which each thread's print buffer is stored. */
static opal_tsd_key_t print_args_tsd_key;
/* Returned by orte_ns_base_print_name_args() when no buffer can be obtained. */
char* orte_print_args_null = "NULL";
/*
 * Destructor registered with the print-args TSD key: releases the
 * buffer stored for a thread.
 */
static void
buffer_cleanup(void *value)
{
    /* free(NULL) is a no-op, so no guard is required */
    free(value);
}
/*
 * Return the calling thread's name-print buffer, creating it on first use.
 *
 * The buffer holds ORTE_PRINT_NAME_ARGS_MAX_SIZE + 1 bytes and is stored
 * under print_args_tsd_key.
 *
 * Returns NULL when the TSD lookup fails, when the allocation fails, or
 * when the new buffer cannot be registered with the TSD key. In the last
 * case the buffer is freed here: handing back an unregistered buffer would
 * leak it and allocate a fresh one on every subsequent call.
 */
static char*
get_print_name_buffer(void)
{
    void *buffer = NULL;

    if (OPAL_SUCCESS != opal_tsd_getspecific(print_args_tsd_key, &buffer)) {
        return NULL;
    }

    if (NULL == buffer) {
        buffer = malloc((ORTE_PRINT_NAME_ARGS_MAX_SIZE + 1) * sizeof(char));
        if (NULL == buffer) {
            return NULL;
        }
        if (OPAL_SUCCESS != opal_tsd_setspecific(print_args_tsd_key, buffer)) {
            free(buffer);
            return NULL;
        }
    }

    return (char*) buffer;
}
/*
 * Format a process name as "[jobid,vpid]" for diagnostic output.
 *
 * name: process name to format; NULL yields the text "[NO-NAME]".
 *
 * Returns the buffer obtained from get_print_name_buffer(), so the text is
 * overwritten by the next call that receives the same buffer. When no
 * buffer can be obtained, ORTE_ERR_OUT_OF_RESOURCE is logged and
 * orte_print_args_null is returned instead. Output is bounded by
 * ORTE_PRINT_NAME_ARGS_MAX_SIZE.
 */
char* orte_ns_base_print_name_args(orte_process_name_t *name)
{
    char *out = get_print_name_buffer();

    if (NULL != out) {
        if (NULL != name) {
            snprintf(out, ORTE_PRINT_NAME_ARGS_MAX_SIZE, "[%ld,%ld]", (long)name->jobid, (long)name->vpid);
        } else {
            snprintf(out, ORTE_PRINT_NAME_ARGS_MAX_SIZE, "[NO-NAME]");
        }
        return out;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return orte_print_args_null;
}
/*
 * Create the TSD key used for the per-thread name-print buffers,
 * registering buffer_cleanup as its destructor. Returns whatever
 * opal_tsd_key_create() reports.
 */
int
orte_ns_base_init_print_args(void)
{
    int rc = opal_tsd_key_create(&print_args_tsd_key, buffer_cleanup);

    return rc;
}

Просмотреть файл

@ -55,7 +55,7 @@ orte_ns_base_create_my_name_not_available(void)
int orte_ns_base_get_proc_name_string(char **name_string,
const orte_process_name_t* name)
{
char *tmp;
char *tmp, *tmp2;
if (NULL == name) { /* got an error */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
@ -66,23 +66,32 @@ int orte_ns_base_get_proc_name_string(char **name_string,
* corresponding string so we can correctly parse the name string when
* it is passed back to us later
*/
if (ORTE_JOBID_WILDCARD == name->jobid) {
asprintf(&tmp, "%s", ORTE_SCHEMA_WILDCARD_STRING);
} else if (ORTE_JOBID_INVALID == name->jobid) {
asprintf(&tmp, "%s", ORTE_SCHEMA_INVALID_STRING);
if (ORTE_CELLID_WILDCARD == name->cellid) {
tmp = strdup(ORTE_SCHEMA_WILDCARD_STRING);
} else if (ORTE_CELLID_INVALID == name->cellid) {
tmp = strdup(ORTE_SCHEMA_INVALID_STRING);
} else {
asprintf(&tmp, "%ld", (long)name->jobid);
asprintf(&tmp, "%ld", (long)name->cellid);
}
if (ORTE_VPID_WILDCARD == name->vpid) {
asprintf(name_string, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
} else if (ORTE_VPID_INVALID == name->vpid) {
asprintf(name_string, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
if (ORTE_JOBID_WILDCARD == name->jobid) {
asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
} else if (ORTE_JOBID_INVALID == name->jobid) {
asprintf(&tmp2, "%s%c%s", tmp, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
} else {
asprintf(name_string, "%s%c%ld", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (long)name->vpid);
asprintf(&tmp2, "%s%c%ld", tmp, ORTE_SCHEMA_DELIMITER_CHAR, (long)name->jobid);
}
free(tmp);
if (ORTE_VPID_WILDCARD == name->vpid) {
asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_WILDCARD_STRING);
} else if (ORTE_VPID_INVALID == name->vpid) {
asprintf(name_string, "%s%c%s", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, ORTE_SCHEMA_INVALID_STRING);
} else {
asprintf(name_string, "%s%c%ld", tmp2, ORTE_SCHEMA_DELIMITER_CHAR, (long)name->vpid);
}
free(tmp2);
return ORTE_SUCCESS;
}
@ -90,6 +99,7 @@ int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
const char* name_string)
{
char *temp, *token;
orte_cellid_t cell;
orte_jobid_t job;
orte_vpid_t vpid;
long int tmpint;
@ -102,16 +112,45 @@ int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
}
temp = strdup(name_string); /** copy input string as the strtok process is destructive */
token = strtok(temp, ORTE_SCHEMA_DELIMITER_STRING); /** get first field -> jobid */
token = strtok(temp, ORTE_SCHEMA_DELIMITER_STRING); /** get first field -> cellid */
/* check for error */
if (NULL == token) {
return ORTE_ERR_BAD_PARAM;
}
/* convert to largest possible int - then
* check to ensure it is within range of cellid_t before casting
*/
/* first, though, check for WILDCARD character - assign
* value accordingly, if found
*/
if (0 == strcmp(token, ORTE_SCHEMA_WILDCARD_STRING)) {
cell = ORTE_CELLID_WILDCARD;
} else if (0 == strcmp(token, ORTE_SCHEMA_INVALID_STRING)) {
cell = ORTE_CELLID_INVALID;
} else {
tmpint = strtol(token, NULL, 10);
if (ORTE_CELLID_MAX >= tmpint && ORTE_CELLID_MIN <= tmpint) {
cell = (orte_cellid_t)tmpint;
} else {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return_code = ORTE_ERR_BAD_PARAM;
goto CLEANUP;
}
}
token = strtok(NULL, ORTE_SCHEMA_DELIMITER_STRING); /** get next field -> jobid */
/** convert to largest possible int - then
* check to ensure it is within range of jobid_t before casting */
/* check for error */
if (NULL == token) {
return ORTE_ERR_BAD_PARAM;
}
/** first, though, check for WILDCARD character - assign
* value accordingly, if found
*/
@ -159,7 +198,7 @@ int orte_ns_base_convert_string_to_process_name(orte_process_name_t **name,
}
if (ORTE_SUCCESS != (return_code =
orte_ns_base_create_process_name(name, job, vpid))) {
orte_ns_base_create_process_name(name, cell, job, vpid))) {
ORTE_ERROR_LOG(return_code);
}
@ -171,6 +210,7 @@ CLEANUP:
/**** CREATE PROCESS NAME ****/
int orte_ns_base_create_process_name(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid)
{
@ -182,6 +222,7 @@ int orte_ns_base_create_process_name(orte_process_name_t **name,
return ORTE_ERR_OUT_OF_RESOURCE;
}
(*name)->cellid = cell;
(*name)->jobid = job;
(*name)->vpid = vpid;
return ORTE_SUCCESS;
@ -296,8 +337,16 @@ int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
* function does not actually stand for a wildcard value, but
* rather a specific value
*/
/* check job id */
if (ORTE_NS_CMP_CELLID & fields) { /* check cellid field */
if (name1->cellid < name2->cellid) {
return ORTE_VALUE2_GREATER;
} else if (name1->cellid > name2->cellid) {
return ORTE_VALUE1_GREATER;
}
}
/* get here if cellid's are equal, or cellid not being checked */
/* now check job id */
if (ORTE_NS_CMP_JOBID & fields) {
if (name1->jobid < name2->jobid) {
@ -307,7 +356,8 @@ int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
}
}
/* get here if jobid's are equal, or not being checked
/* get here if cellid's and jobid's are equal, or neither being checked,
* or cellid not checked and jobid's equal.
* now check vpid
*/
@ -320,7 +370,8 @@ int orte_ns_base_compare_fields(orte_ns_cmp_bitmask_t fields,
}
/* only way to get here is if all fields are being checked and are equal,
* or jobid not checked, but vpid equal,
* or cellid not checked, but jobid and vpid equal,
* or cellid and jobid not checked, but vpid equal,
* only vpid being checked, and equal
* return that fact
*/

Просмотреть файл

@ -57,6 +57,7 @@ typedef uint8_t orte_ns_cmd_flag_t;
* typedefs above and in ns_types.h
*/
#define ORTE_NS_CMD ORTE_INT8
#define ORTE_CELLID_T ORTE_INT32
#define ORTE_NODEID_T ORTE_INT32
#define ORTE_JOBID_T ORTE_INT32
#define ORTE_VPID_T ORTE_INT32
@ -64,6 +65,8 @@ typedef uint8_t orte_ns_cmd_flag_t;
/*
* define flag values for remote commands - only used internally
*/
#define ORTE_NS_CREATE_CELLID_CMD (int8_t) 1
#define ORTE_NS_GET_CELL_INFO_CMD (int8_t) 2
#define ORTE_NS_CREATE_NODEID_CMD (int8_t) 3
#define ORTE_NS_GET_NODE_INFO_CMD (int8_t) 4
#define ORTE_NS_CREATE_JOBID_CMD (int8_t) 5
@ -76,6 +79,7 @@ typedef uint8_t orte_ns_cmd_flag_t;
#define ORTE_NS_GET_PEERS_CMD (int8_t) 12
#define ORTE_NS_DEFINE_DATA_TYPE_CMD (int8_t) 13
#define ORTE_NS_CREATE_MY_NAME_CMD (int8_t) 14
#define ORTE_NS_DUMP_CELLS_CMD (int8_t) 15
#define ORTE_NS_DUMP_JOBIDS_CMD (int8_t) 16
#define ORTE_NS_DUMP_TAGS_CMD (int8_t) 17
#define ORTE_NS_DUMP_DATATYPES_CMD (int8_t) 18
@ -88,6 +92,7 @@ typedef uint8_t orte_ns_cmd_flag_t;
*/
ORTE_DECLSPEC int orte_ns_base_create_process_name(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
@ -109,11 +114,19 @@ ORTE_DECLSPEC int orte_ns_base_convert_jobid_to_string(char **jobid_string, c
ORTE_DECLSPEC int orte_ns_base_convert_string_to_jobid(orte_jobid_t *jobid, const char* jobidstring);
ORTE_DECLSPEC int orte_ns_base_get_cellid_string(char **cellid_string, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_cellid(orte_cellid_t *cellid, const char *cellidstring);
ORTE_DECLSPEC int orte_ns_base_convert_cellid_to_string(char **cellid_string, const orte_cellid_t cellid);
ORTE_DECLSPEC int orte_ns_base_get_vpid(orte_vpid_t *vpid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_get_jobid(orte_jobid_t *jobid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *nodeid, const char *string);
ORTE_DECLSPEC int orte_ns_base_get_cellid(orte_cellid_t *cellid, const orte_process_name_t* name);
ORTE_DECLSPEC int orte_ns_base_convert_string_to_nodeid(orte_nodeid_t *cellid, const char *string);
ORTE_DECLSPEC int orte_ns_base_convert_nodeid_to_string(char **nodeid_string, const orte_nodeid_t nodeid);
@ -127,10 +140,16 @@ ORTE_DECLSPEC int orte_ns_base_print_dump(orte_buffer_t *buffer);
/* not available functions */
ORTE_DECLSPEC int orte_ns_base_module_init_not_available(void);
ORTE_DECLSPEC int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
char **nodename);
ORTE_DECLSPEC int orte_ns_base_create_cellid_not_available(orte_cellid_t *cellid,
char *site, char *resource);
ORTE_DECLSPEC int orte_ns_base_get_node_info_not_available(char ***nodename,
ORTE_DECLSPEC int orte_ns_base_get_cell_info_not_available(orte_cellid_t cellid,
char **site, char **resource);
ORTE_DECLSPEC int orte_ns_base_create_nodeids_not_available(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodename);
ORTE_DECLSPEC int orte_ns_base_get_node_info_not_available(char ***nodename, orte_cellid_t cellid,
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
ORTE_DECLSPEC int orte_ns_base_create_jobid_not_available(orte_jobid_t *jobid, opal_list_t *attrs);
@ -167,6 +186,7 @@ ORTE_DECLSPEC int orte_ns_base_create_my_name_not_available(void);
ORTE_DECLSPEC int orte_ns_base_get_peers_not_available(orte_process_name_t **procs,
orte_std_cntr_t *num_procs, opal_list_t *attributes);
ORTE_DECLSPEC int orte_ns_base_dump_cells_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_jobs_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_tags_not_available(void);
ORTE_DECLSPEC int orte_ns_base_dump_datatypes_not_available(void);
@ -178,6 +198,9 @@ ORTE_DECLSPEC int orte_ns_base_ft_event_not_available(int state);
ORTE_DECLSPEC int orte_ns_base_pack_name(orte_buffer_t *buffer, const void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_cellid(orte_buffer_t *buffer, const void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_pack_nodeid(orte_buffer_t *buffer, const void *src,
orte_std_cntr_t num_vals, orte_data_type_t type);
@ -190,6 +213,9 @@ ORTE_DECLSPEC int orte_ns_base_pack_vpid(orte_buffer_t *buffer, const void *s
ORTE_DECLSPEC int orte_ns_base_unpack_name(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_cellid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
ORTE_DECLSPEC int orte_ns_base_unpack_nodeid(orte_buffer_t *buffer, void *dest,
orte_std_cntr_t *num_vals, orte_data_type_t type);
@ -207,6 +233,8 @@ int orte_ns_base_copy_name(orte_process_name_t **dest, orte_process_name_t *src,
int orte_ns_base_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, orte_data_type_t type);
int orte_ns_base_copy_cellid(orte_cellid_t **dest, orte_cellid_t *src, orte_data_type_t type);
int orte_ns_base_copy_nodeid(orte_nodeid_t **dest, orte_nodeid_t *src, orte_data_type_t type);
int orte_ns_base_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, orte_data_type_t type);
@ -228,6 +256,10 @@ int orte_ns_base_compare_jobid(orte_jobid_t *value1,
orte_jobid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_cellid(orte_cellid_t *value1,
orte_cellid_t *value2,
orte_data_type_t type);
int orte_ns_base_compare_nodeid(orte_nodeid_t *value1,
orte_nodeid_t *value2,
orte_data_type_t type);

Просмотреть файл

@ -59,25 +59,116 @@ extern "C" {
*/
typedef int (*orte_ns_base_module_init_fn_t)(void);
/**** CELL FUNCTIONS ****/
/**
* Create a new cell id.
* Allocates a new cell id for use by the caller. The function returns an
* existing cellid if the specified site/resource already has been assigned
* one.
*
* @param site The name of the site where the cell is located.
* @param resource The name of the resource associated with this cell (e.g., the name
* of the cluster).
* @param cellid The location where the cellid is to be stored.
*
* @retval ORTE_SUCCESS A cellid was created and returned.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*
* @endcode
*/
typedef int (*orte_ns_base_module_create_cellid_fn_t)(orte_cellid_t *cellid,
char *site, char *resource);
/**
* Get cell info
* Retrieve the site and resource info on a cell.
*
* @param cellid The id of the cell whose info is being requested.
* @param site Returns a pointer to a strdup'd string containing the site name.
* @param resource Returns a pointer to a strdup'd string containing the resource name.
* @retval ORTE_SUCCESS The cell info was retrieved and returned.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*/
typedef int (*orte_ns_base_module_get_cell_info_fn_t)(orte_cellid_t cellid,
char **site, char **resource);
/**
* Get the cell id as a character string.
* The get_cellid_string() function returns the cell id in a character string
* representation. The string is created by expressing the field in hexadecimal. Memory
* for the string is allocated by the function - releasing that allocation is the
* responsibility of the calling program.
*
* @param *name A pointer to the name structure containing the name to be
* "translated" to a string.
*
* @retval *name_string A pointer to the character string representation of the
* cell id.
* @retval NULL Indicates an error occurred - either no memory could be allocated
* or the caller provided an incorrect name pointer (e.g., NULL).
*
* @code
* cellid-string = ompi_name_server.get_cellid_string(&name)
* @endcode
*/
typedef int (*orte_ns_base_module_get_cellid_string_fn_t)(char **cellid_string, const orte_process_name_t* name);
/**
* Convert cellid to character string
* Returns the cellid in a character string representation. The string is created
* by expressing the provided cellid in hexadecimal. Memory for the string is
* allocated by the function - releasing that allocation is the responsibility of
* the calling program.
*
* @param cellid The cellid to be converted.
*
* @retval *cellid_string A pointer to a character string representation of the cellid.
* @retval NULL Indicates an error occurred - probably no memory could be allocated.
*
* @code
* cellid-string = ompi_name_server.convert_cellid_to_string(cellid);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_cellid_to_string_fn_t)(char **cellid_string, const orte_cellid_t cellid);
/**
* Convert a string to a cellid.
* Converts a character string into a cellid. The character string must be a
* hexadecimal representation of a valid cellid.
*
* @param cellidstring The string to be converted.
*
* @retval cellid The resulting cellid
* @retval MCA_NS_BASE_CELLID_MAX String could not be converted.
*
* @code
* cellid = ompi_name_server.convert_string_to_cellid(cellidstring);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_cellid_fn_t)(orte_cellid_t *cellid, const char *cellidstring);
/**** NODE FUNCTIONS ****/
/*
* Get an array of node id's
* Given a NULL-terminated array of names of nodes within it, this function assigns an id to represent
* each node.
* Given the cell and a NULL-terminated array of names of nodes within it, this function assigns an id to represent
* each node within the cell.
*/
typedef int (*orte_ns_base_module_create_nodeids_fn_t)(orte_nodeid_t **nodes, orte_std_cntr_t *nnodes,
char **nodenames);
orte_cellid_t cellid, char **nodename);
/*
* Get node info
* Retrieve the names of an array of nodes given their nodeids.
* Retrieve the names of an array of nodes given their cellid and nodeids. The cellid
* is required as the nodeids are only unique within a given cell.
*
* @param cellid The id of the cell of the node.
* @param nodeids The ids of the node.
* @param nodenames Returns a pointer to a NULL-terminated array of strdup'd strings containing the node names.
* @retval ORTE_SUCCESS The nodename was created and returned.
* @retval ORTE_ERROR_VALUE An error code indicative of the problem.
*/
typedef int (*orte_ns_base_module_get_node_info_fn_t)(char ***nodename,
typedef int (*orte_ns_base_module_get_node_info_fn_t)(char ***nodename, orte_cellid_t cellid,
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
/*
@ -169,6 +260,8 @@ typedef int (*orte_ns_base_module_get_parent_job_fn_t)(orte_jobid_t *parent, ort
/**
* Reserve a range of process id's.
* The reserve_range() function reserves a range of vpid's for the given jobid.
* Note that the cellid does not factor into this request - jobid's span the entire universe,
* hence the cell where the process is currently executing is irrelevant to this request.
*
* @param jobid The id of the job for which the vpid's are to be reserved.
* @param range The number of vpid's to be reserved. The function will find the
@ -257,7 +350,13 @@ typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jo
* The create_process_name() function creates a single process name structure and fills the
* fields with the provided values.
*
* @param job The id of the job to which the process will belong.
* @param cell The cell for which the process name is intended. Usually, this is
* the id of the cell where the process is initially planning to be spawned.
* @param job The id of the job to which the process will belong. Process id's are
* tracked according to jobid, but not cellid. Thus, two processes
* can have the same process id if and only if they have different jobid's. However,
* two processes in the same jobid cannot have the same process id, regardless
* of whether or not they are in the same cell.
* @param vpid The virtual process id for the name. Note that no check is made for uniqueness -
* the caller is responsible for ensuring that the requested name is, in fact, unique
* by first requesting reservation of an appropriate range of virtual process id's.
@ -271,6 +370,7 @@ typedef int (*orte_ns_base_module_convert_string_to_jobid_fn_t)(orte_jobid_t *jo
* @endcode
*/
typedef int (*orte_ns_base_module_create_proc_name_fn_t)(orte_process_name_t **name,
orte_cellid_t cell,
orte_jobid_t job,
orte_vpid_t vpid);
@ -289,14 +389,17 @@ typedef int (*orte_ns_base_module_create_my_name_fn_t)(void);
* Convert a string representation to a process name.
* The convert_string_to_process_name() function converts a string representation of a process
* name into an Open MPI name structure. The string must be of the proper form - i.e., it
* must be in the form "jobid.vpid", where each field is expressed in hexadecimal form.
* must be in the form "cellid.jobid.vpid", where each field is expressed in hexadecimal form.
*
* @param *name_string A character string representation of a process name.
*
* @retval *name Pointer to an orte_process_name_t structure containing the name.
* @retval *name Pointer to an ompi_process_name_t structure containing the name.
* @retval NULL Indicates an error, probably due to inability to allocate memory for
* the name structure.
*
* @code
* name = ompi_name_server.convert_string_to_process_name(name_string);
* @endcode
*/
typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_process_name_t **name,
const char* name_string);
@ -305,7 +408,10 @@ typedef int (*orte_ns_base_module_convert_string_to_process_name_fn_t)(orte_proc
/**
* Get the process name as a character string.
* The get_proc_name_string() function returns the entire process name in a
* character string representation.
* character string representation. The string is created by expressing each
* field in hexadecimal separated by periods, as follows:
*
* sprintf(string_name, "%x.%x.%x", cellid, jobid, vpid)
*
* The memory required for the string is allocated by the function - releasing
* that allocation is the responsibility of the calling program.
@ -330,13 +436,13 @@ typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
* The compare() function checks the value of the fields in the two
* provided names, and returns a value indicating if the first one is less than, greater
* than, or equal to the second. The value of each field is compared in a hierarchical
* fashion, with jobid and vpid in sequence. The bit-mask
* fashion, with cellid first, followed by jobid and vpid in sequence. The bit-mask
* indicates which fields are to be included in the comparison. Fields not included via the
* bit-mask are ignored. Thus, the caller may request that any combination of the two fields
* bit-mask are ignored. Thus, the caller may request that any combination of the three fields
* be included in the comparison.
*
* @param fields A bit-mask indicating which fields are to be included in the comparison. The
* comparison is performed on a hierarchical basis, with
* comparison is performed on a hierarchical basis, with cellid being first, followed by
* jobid and then vpid. Each field can be included separately, thus allowing the caller
* to configure the comparison to meet their needs.
* @param *name1 A pointer to the first name structure.
@ -348,6 +454,11 @@ typedef int (*orte_ns_base_module_get_proc_name_string_fn_t)(char **name_string,
* @retval +1 The indicated fields of the first provided name is greater than the same
* fields of the second provided name.
*
* The function returns a large negative value if there is an error.
*
* @code
* result = ompi_name_server.compare(bit_mask, &name1, &name2)
* @endcode
*/
typedef int (*orte_ns_base_module_compare_fields_fn_t)(orte_ns_cmp_bitmask_t fields,
const orte_process_name_t* name1,
@ -451,9 +562,9 @@ typedef int (*orte_ns_base_module_define_data_type_fn_t)(
* request that all peers for the parent job be returned, for example.
* More common options would be to specify a cell or job.
*
* NOTE ORTE_JOBID_WILDCARD
* NOTE The combination of ORTE_CELLID_WILDCARD and ORTE_JOBID_WILDCARD
* in the attribute list will cause the function to return the names of *all*
* processes currently active.
* processes currently active in the universe.
*
*/
typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
@ -464,6 +575,8 @@ typedef int (*orte_ns_base_module_get_peers_fn_t)(orte_process_name_t **procs,
/*
* DIAGNOSTIC INTERFACES
*/
typedef int (*orte_ns_base_module_dump_cells_fn_t)(void);
typedef int (*orte_ns_base_module_dump_jobs_fn_t)(void);
typedef int (*orte_ns_base_module_dump_tags_fn_t)(void);
@ -478,6 +591,12 @@ typedef int (*orte_ns_base_module_ft_event_fn_t)(int state);
struct mca_ns_base_module_2_0_0_t {
/* init */
orte_ns_base_module_init_fn_t init;
/* cell functions */
orte_ns_base_module_create_cellid_fn_t create_cellid;
orte_ns_base_module_get_cell_info_fn_t get_cell_info;
orte_ns_base_module_get_cellid_string_fn_t get_cellid_string;
orte_ns_base_module_convert_cellid_to_string_fn_t convert_cellid_to_string;
orte_ns_base_module_convert_string_to_cellid_fn_t convert_string_to_cellid;
/** node functions */
orte_ns_base_module_create_nodeids_fn_t create_nodeids;
orte_ns_base_module_get_node_info_fn_t get_node_info;
@ -512,6 +631,7 @@ struct mca_ns_base_module_2_0_0_t {
/* data type functions */
orte_ns_base_module_define_data_type_fn_t define_data_type;
/* diagnostic functions */
orte_ns_base_module_dump_cells_fn_t dump_cells;
orte_ns_base_module_dump_jobs_fn_t dump_jobs;
orte_ns_base_module_dump_tags_fn_t dump_tags;
orte_ns_base_module_dump_datatypes_fn_t dump_datatypes;

Просмотреть файл

@ -50,6 +50,7 @@ extern "C" {
/**** NS ATTRIBUTES ****/
#define ORTE_NS_USE_PARENT "orte-ns-use-parent"
#define ORTE_NS_USE_ROOT "orte-ns-use-root"
#define ORTE_NS_USE_CELL "orte-ns-use-cell"
#define ORTE_NS_USE_JOBID "orte-ns-use-job"
#define ORTE_NS_USE_NODE "orte-ns-use-node"
#define ORTE_NS_INCLUDE_DESCENDANTS "orte-ns-include-desc"
@ -58,6 +59,7 @@ extern "C" {
#define ORTE_NAME_ARGS(n) \
(long) ((NULL == n) ? (long)-1 : (long)(n)->cellid), \
(long) ((NULL == n) ? (long)-1 : (long)(n)->jobid), \
(long) ((NULL == n) ? (long)-1 : (long)(n)->vpid)
@ -67,6 +69,7 @@ extern "C" {
*/
#define ORTE_NS_CMP_NONE 0x00
#define ORTE_NS_CMP_CELLID 0x01
#define ORTE_NS_CMP_JOBID 0x02
#define ORTE_NS_CMP_VPID 0x04
#define ORTE_NS_CMP_ALL 0Xff
@ -83,26 +86,23 @@ extern "C" {
* ns_private.h
*/
typedef orte_std_cntr_t orte_jobid_t;
typedef orte_std_cntr_t orte_cellid_t;
typedef orte_std_cntr_t orte_nodeid_t;
typedef orte_std_cntr_t orte_vpid_t;
typedef uint8_t orte_ns_cmp_bitmask_t; /**< Bit mask for comparing process names */
struct orte_process_name_t {
orte_cellid_t cellid; /**< Cell number */
orte_jobid_t jobid; /**< Job number */
orte_vpid_t vpid; /**< Process number */
};
typedef struct orte_process_name_t orte_process_name_t;
/* useful define to print name args in output messages */
ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *name);
#define ORTE_NAME_PRINT(n) \
orte_ns_base_print_name_args(n)
/*
* define maximum value for id's in any field
*/
#define ORTE_CELLID_MAX ORTE_STD_CNTR_MAX
#define ORTE_JOBID_MAX ORTE_STD_CNTR_MAX
#define ORTE_VPID_MAX ORTE_STD_CNTR_MAX
#define ORTE_NODEID_MAX ORTE_STD_CNTR_MAX
@ -110,6 +110,7 @@ ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *nam
/*
* define minimum value for id's in any field
*/
#define ORTE_CELLID_MIN ORTE_STD_CNTR_MIN
#define ORTE_JOBID_MIN ORTE_STD_CNTR_MIN
#define ORTE_VPID_MIN ORTE_STD_CNTR_MIN
#define ORTE_NODEID_MIN ORTE_STD_CNTR_MIN
@ -117,6 +118,7 @@ ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *nam
/*
* define invalid values
*/
#define ORTE_CELLID_INVALID (ORTE_CELLID_MIN + 1)
#define ORTE_JOBID_INVALID (ORTE_JOBID_MIN + 1)
#define ORTE_VPID_INVALID (ORTE_VPID_MIN + 1)
#define ORTE_NODEID_INVALID (ORTE_NODEID_MIN + 1)
@ -124,6 +126,7 @@ ORTE_DECLSPEC extern char* orte_ns_base_print_name_args(orte_process_name_t *nam
/*
* define wildcard values (should be -1)
*/
#define ORTE_CELLID_WILDCARD -1
#define ORTE_JOBID_WILDCARD -1
#define ORTE_VPID_WILDCARD -1
#define ORTE_NODEID_WILDCARD -1
@ -149,6 +152,7 @@ ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_my_hnp; /** instantiated
* @param name
*/
#define ORTE_PROCESS_NAME_HTON(n) \
n.cellid = htonl(n.cellid); \
n.jobid = htonl(n.jobid); \
n.vpid = htonl(n.vpid);
@ -158,6 +162,7 @@ ORTE_DECLSPEC extern orte_process_name_t orte_ns_name_my_hnp; /** instantiated
* @param name
*/
#define ORTE_PROCESS_NAME_NTOH(n) \
n.cellid = ntohl(n.cellid); \
n.jobid = ntohl(n.jobid); \
n.vpid = ntohl(n.vpid);

Просмотреть файл

@ -75,6 +75,8 @@ int orte_ns_proxy_finalize(void);
typedef struct {
size_t max_size, block_size;
int debug;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
orte_pointer_array_t *tags;
orte_rml_tag_t num_tags;
orte_pointer_array_t *dts;
@ -93,9 +95,14 @@ extern orte_ns_proxy_globals_t orte_ns_proxy;
/*
* proxy function prototypes
*/
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames);
int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource);
int orte_ns_proxy_get_node_info(char ***nodename, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid, char **site, char **resource);
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodenames);
int orte_ns_proxy_get_node_info(char ***nodename, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
int orte_ns_proxy_create_jobid(orte_jobid_t *jobid, opal_list_t *attrs);
@ -127,6 +134,8 @@ int orte_ns_proxy_create_my_name(void);
/*
* Diagnostic functions
*/
int orte_ns_proxy_dump_cells(void);
int orte_ns_proxy_dump_jobs(void);
int orte_ns_proxy_dump_tags(void);

Просмотреть файл

@ -45,7 +45,186 @@
* functions
*/
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames)
/*
 * Request a cellid from the name service replica.
 *
 * Sends ORTE_NS_CREATE_CELLID_CMD followed by the value currently stored
 * in *cellid, the site string and the resource string, then waits for the
 * reply.  The reply must echo the same command; the cellid that follows
 * it is unpacked into *cellid.
 *
 * @param cellid   In: value packed into the request. Out: cellid unpacked
 *                 from the replica's answer.
 * @param site     Site name sent to the replica.
 * @param resource Resource name sent to the replica.
 *
 * @retval ORTE_SUCCESS              The answer was received and unpacked.
 * @retval ORTE_ERR_OUT_OF_RESOURCE  A buffer could not be allocated.
 * @retval ORTE_ERR_COMM_FAILURE     Send/receive failed or the reply
 *                                   carried an unexpected command.
 * @retval other                     Error code returned by pack/unpack.
 *
 * This function never acquires orte_ns_proxy.mutex, so it must not
 * release it either.
 */
int orte_ns_proxy_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_buffer_t* cmd;
    orte_buffer_t* answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    command = ORTE_NS_CREATE_CELLID_CMD;

    /* build the request: command, cellid, site, resource */
    cmd = OBJ_NEW(orte_buffer_t);
    if (cmd == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &site, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &resource, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(cmd);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(cmd);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_RELEASE(cmd);

    /* wait for and decode the answer */
    answer = OBJ_NEW(orte_buffer_t);
    if (answer == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    /* the replica echoes the command it is answering */
    if (ORTE_NS_CREATE_CELLID_CMD != command) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_RELEASE(answer);
        return ORTE_ERR_COMM_FAILURE;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, cellid, &count, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
        return rc;
    }

    OBJ_RELEASE(answer);
    return ORTE_SUCCESS;
}
/*
 * Ask the name service replica for the site and resource of a cell.
 *
 * Sends ORTE_NS_GET_CELL_INFO_CMD followed by the cellid, then waits for
 * the reply.  The reply must echo the same command; the two strings that
 * follow it are unpacked into *site and *resource.
 *
 * @param cellid   The cell being queried.
 * @param site     Out: site string unpacked from the answer.
 * @param resource Out: resource string unpacked from the answer.
 *
 * @retval ORTE_SUCCESS              Both strings were unpacked.
 * @retval ORTE_ERR_OUT_OF_RESOURCE  A buffer could not be allocated.
 * @retval ORTE_ERR_COMM_FAILURE     Send/receive failed or the reply
 *                                   carried an unexpected command.
 * @retval other                     Error code returned by pack/unpack.
 *
 * orte_ns_proxy.mutex is taken on entry and released on the single exit
 * path, so every unlock has a matching lock.  The same exit path releases
 * whichever buffers are still held, including the answer on success.
 */
int orte_ns_proxy_get_cell_info(orte_cellid_t cellid,
                                char **site, char **resource)
{
    orte_buffer_t* cmd = NULL;
    orte_buffer_t* answer = NULL;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    command = ORTE_NS_GET_CELL_INFO_CMD;

    /* build the request: command, cellid */
    cmd = OBJ_NEW(orte_buffer_t);
    if (cmd == NULL) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, cmd, ORTE_RML_TAG_NS, 0)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    OBJ_RELEASE(cmd);
    cmd = NULL;   /* already released; keep CLEANUP from releasing it again */

    /* wait for and decode the answer */
    answer = OBJ_NEW(orte_buffer_t);
    if (answer == NULL) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* the replica echoes the command it is answering */
    if (ORTE_NS_GET_CELL_INFO_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, site, &count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(answer, resource, &count, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    rc = ORTE_SUCCESS;

CLEANUP:
    if (NULL != cmd) {
        OBJ_RELEASE(cmd);
    }
    if (NULL != answer) {
        OBJ_RELEASE(answer);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodenames)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
@ -69,6 +248,12 @@ int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnode
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
return rc;
}
count = opal_argv_count(nodenames);
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &count, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
@ -138,7 +323,8 @@ int orte_ns_proxy_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnode
return ORTE_SUCCESS;
}
int orte_ns_proxy_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
int orte_ns_proxy_get_node_info(char ***nodenames, orte_cellid_t cellid,
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
{
orte_buffer_t* cmd;
orte_buffer_t* answer;
@ -164,6 +350,13 @@ int orte_ns_proxy_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, or
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &cellid, 1, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_dss.pack(cmd, &num_nodes, 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(cmd);

Просмотреть файл

@ -70,6 +70,12 @@ mca_ns_base_component_t mca_ns_proxy_component = {
static mca_ns_base_module_t orte_ns_proxy_module = {
/* init */
orte_ns_proxy_module_init,
/* cell functions */
orte_ns_proxy_create_cellid,
orte_ns_proxy_get_cell_info,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/** node functions */
orte_ns_proxy_create_nodeids,
orte_ns_proxy_get_node_info,
@ -104,6 +110,7 @@ static mca_ns_base_module_t orte_ns_proxy_module = {
/* data type functions */
orte_ns_proxy_define_data_type,
/* diagnostic functions */
orte_ns_proxy_dump_cells,
orte_ns_proxy_dump_jobs,
orte_ns_proxy_dump_tags,
orte_ns_proxy_dump_datatypes,
@ -224,6 +231,17 @@ mca_ns_base_module_t* orte_ns_proxy_init(int *priority)
return NULL;
}
/* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.cells),
(orte_std_cntr_t)orte_ns_proxy.block_size,
(orte_std_cntr_t)orte_ns_proxy.max_size,
(orte_std_cntr_t)orte_ns_proxy.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_proxy.num_cells = 0;
/* initialize the taglist */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_proxy.tags),

Просмотреть файл

@ -40,6 +40,66 @@
/*
* DIAGNOSTIC functions
*/
/*
 * Ask the name service replica for a dump of its cell tracker and print it.
 *
 * Sends ORTE_NS_DUMP_CELLS_CMD to ORTE_NS_MY_REPLICA, receives the answer,
 * checks that the answer carries the same command, and passes the rest of the
 * answer buffer to orte_ns_base_print_dump().
 *
 * The proxy mutex is taken for the whole exchange and is released on every
 * exit path; whichever buffer is currently constructed is destructed as well.
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_COMM_FAILURE when the send/receive fails or
 * the answer carries a different command, or the error code of the
 * pack/unpack/print call that failed.
 */
int orte_ns_proxy_dump_cells(void)
{
    orte_buffer_t cmd;
    orte_buffer_t answer;
    orte_ns_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc;

    command = ORTE_NS_DUMP_CELLS_CMD;

    OPAL_THREAD_LOCK(&orte_ns_proxy.mutex);

    /* dump name service replica cell tracker */
    OBJ_CONSTRUCT(&cmd, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return rc;
    }

    if (0 > orte_rml.send_buffer(ORTE_NS_MY_REPLICA, &cmd, ORTE_RML_TAG_NS, 0)) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
        OBJ_DESTRUCT(&cmd);
        OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
        return ORTE_ERR_COMM_FAILURE;
    }
    OBJ_DESTRUCT(&cmd);

    /* from here on only "answer" is live; every exit goes through cleanup */
    OBJ_CONSTRUCT(&answer, orte_buffer_t);
    if (0 > orte_rml.recv_buffer(ORTE_NS_MY_REPLICA, &answer, ORTE_RML_TAG_NS)) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&answer, &command, &count, ORTE_NS_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* the replica must answer with the command we sent */
    if (ORTE_NS_DUMP_CELLS_CMD != command) {
        rc = ORTE_ERR_COMM_FAILURE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&answer))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

cleanup:
    /* previously the success path and the unpack/print failure paths
     * returned with the mutex still held, and success leaked "answer" */
    OBJ_DESTRUCT(&answer);
    OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
    return rc;
}
int orte_ns_proxy_dump_jobs(void)
{
orte_buffer_t cmd;
@ -160,8 +220,8 @@ int orte_ns_proxy_dump_tags(void)
}
/* dump local tag tracker */
opal_output(mca_ns_base_output, "\n\n%s Dump of Local Tag Tracker\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Tag Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_tagitem_t**)(orte_ns_proxy.tags)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_tags &&
i < (orte_ns_proxy.tags)->size; i++) {
@ -235,8 +295,8 @@ int orte_ns_proxy_dump_datatypes(void)
}
/* dump local datatype tracker */
opal_output(mca_ns_base_output, "\n\n%s Dump of Local Datatype Tracker\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(mca_ns_base_output, "\n\n[%lu,%lu,%lu] Dump of Local Datatype Tracker\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
ptr = (orte_ns_proxy_dti_t**)(orte_ns_proxy.dts)->addr;
for (i=0, j=0; j < orte_ns_proxy.num_dts &&
i < (orte_ns_proxy.dts)->size; i++) {

Просмотреть файл

@ -47,6 +47,7 @@ int orte_ns_proxy_get_peers(orte_process_name_t **procs,
orte_buffer_t* answer;
orte_ns_cmd_flag_t command;
orte_std_cntr_t count, nprocs, i;
orte_cellid_t *cptr;
orte_attribute_t *attr;
int rc;
@ -58,12 +59,35 @@ int orte_ns_proxy_get_peers(orte_process_name_t **procs,
*procs = NULL;
*num_procs = 0;
/* check the attributes to see if USE_JOB has been set. If not, then this is
/* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is
* a request for my own job peers - process that one locally
*/
/* if the cell is given AND it matches my own, then we can process this
* quickly. Otherwise, we have to do some more work.
*
* RHC: when we go multi-cell, we need a way to find all the cells upon
* which a job is executing so we can make this work!
*/
if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return rc;
}
if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
OPAL_THREAD_UNLOCK(&orte_ns_proxy.mutex);
return ORTE_ERR_NOT_IMPLEMENTED;
}
}
if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) {
/* get my own job peers */
/* get my own job peers, assuming all are on this cell - process here
*
* RHC: This is a bad assumption. When we go multi-cell, we are going to have to process
* get peer requests solely on the HNP since we won't know the cellid otherwise
*/
*procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@ -72,6 +96,7 @@ int orte_ns_proxy_get_peers(orte_process_name_t **procs,
}
for (i=0; i < orte_process_info.num_procs; i++) {
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
(*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid;
(*procs)[i].vpid = orte_process_info.vpid_start + i;
}

Просмотреть файл

@ -0,0 +1,452 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
*/
#include "orte_config.h"
#include <stdio.h>
#include <string.h>
#include "opal/threads/mutex.h"
#include "opal/util/output.h"
#include "opal/util/trace.h"
#include "orte/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/base/base.h"
#include "orte/mca/ns/base/ns_private.h"
#include "ns_replica.h"
/**
* globals
*/
#define NS_REPLICA_MAX_STRING_SIZE 256
/*
* DIAGNOSTIC functions
*/
/*
 * Print the replica's cell tracker.
 *
 * Collects the dump into a temporary buffer with
 * orte_ns_replica_dump_cells_fn() and prints it with
 * orte_ns_base_print_dump().  The temporary buffer is destructed on every
 * path, including when collecting the dump fails part-way through.
 *
 * Returns ORTE_SUCCESS or the error code of the step that failed.
 */
int orte_ns_replica_dump_cells(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
        /* the buffer may already hold packed strings - release it */
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    OBJ_DESTRUCT(&buffer);
    return ORTE_SUCCESS;
}
/*
 * Pack a textual dump of the cell tracker into the caller's buffer.
 *
 * A header string is packed first, followed by two strings per occupied
 * slot of orte_ns_replica.cells: one with the running count and cell id,
 * one with the site and resource.  The walk stops once num_cells occupied
 * slots have been reported or the array is exhausted.  The replica mutex
 * is held for the duration.
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack that failed.
 */
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
    char text[NS_REPLICA_MAX_STRING_SIZE];
    char *line = text;      /* pack is handed the address of a char* */
    orte_ns_replica_cell_tracker_t **trackers;
    orte_std_cntr_t slot;
    orte_cellid_t shown;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
    rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto done;
    }

    trackers = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    shown = 0;
    for (slot = 0;
         shown < orte_ns_replica.num_cells && slot < (orte_ns_replica.cells)->size;
         slot++) {
        if (NULL == trackers[slot]) {
            continue;   /* empty slot */
        }
        shown++;

        snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
                 (unsigned long)shown, (unsigned long)trackers[slot]->cell);
        rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto done;
        }

        snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
                 trackers[slot]->site, trackers[slot]->resource);
        rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto done;
        }
    }

done:
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/*
 * Print the replica's jobid tracker.
 *
 * Collects the dump into a temporary buffer with
 * orte_ns_replica_dump_jobs_fn() and prints it with
 * orte_ns_base_print_dump().  The temporary buffer is destructed on every
 * path, including when collecting the dump fails part-way through.
 *
 * Returns ORTE_SUCCESS or the error code of the step that failed.
 */
int orte_ns_replica_dump_jobs(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
        /* the buffer may already hold packed strings - release it */
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    OBJ_DESTRUCT(&buffer);
    return ORTE_SUCCESS;
}
/*
 * Pack a textual dump of the jobid tracker into the caller's buffer.
 *
 * A header string is packed first, followed by one string per occupied
 * slot of orte_ns_replica.jobids giving the running count, the jobid and
 * the next vpid.  The walk stops once num_jobids occupied slots have been
 * reported or the array is exhausted.  The replica mutex is held for the
 * duration.
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack that failed.
 */
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer)
{
    char text[NS_REPLICA_MAX_STRING_SIZE];
    char *line = text;      /* pack is handed the address of a char* */
    orte_ns_replica_jobid_tracker_t **trackers;
    orte_std_cntr_t slot;
    orte_cellid_t shown;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Jobid Tracker\n");
    rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto done;
    }

    trackers = (orte_ns_replica_jobid_tracker_t**)(orte_ns_replica.jobids)->addr;
    shown = 0;
    for (slot = 0;
         shown < orte_ns_replica.num_jobids && slot < (orte_ns_replica.jobids)->size;
         slot++) {
        if (NULL == trackers[slot]) {
            continue;   /* empty slot */
        }
        shown++;

        snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tJobid: %lu\tNext vpid: %lu\n",
                 (unsigned long)shown, (unsigned long)trackers[slot]->jobid,
                 (unsigned long)trackers[slot]->next_vpid);
        rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto done;
        }
    }

done:
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/*
 * Print the replica's RML tag tracker.
 *
 * Collects the dump into a temporary buffer with
 * orte_ns_replica_dump_tags_fn() and prints it with
 * orte_ns_base_print_dump().  The temporary buffer is destructed on every
 * path, including when collecting the dump fails part-way through.
 *
 * Returns ORTE_SUCCESS or the error code of the step that failed.
 */
int orte_ns_replica_dump_tags(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_tags_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
        /* the buffer may already hold packed strings - release it */
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    OBJ_DESTRUCT(&buffer);
    return ORTE_SUCCESS;
}
/*
 * Pack a textual dump of the RML tag tracker into the caller's buffer.
 *
 * A header string is packed first, followed by one string per occupied
 * slot of orte_ns_replica.tags giving the running count, the tag id and
 * the tag name.  The walk stops once num_tags occupied slots have been
 * reported or the array is exhausted.  The replica mutex is held for the
 * duration.
 *
 * Tags may have been assigned without a name, in which case the stored
 * name is NULL; such entries are printed as "(null)" rather than handing
 * a NULL pointer to the %s conversion.
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack that failed.
 */
int orte_ns_replica_dump_tags_fn(orte_buffer_t *buffer)
{
    orte_std_cntr_t i;
    orte_rml_tag_t j;
    orte_ns_replica_tagitem_t **ptr;
    char tmp_out[NS_REPLICA_MAX_STRING_SIZE], *tmp;
    const char *label;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    tmp = tmp_out;
    snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service RML Tag Tracker\n");
    if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
        ORTE_ERROR_LOG(rc);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    ptr = (orte_ns_replica_tagitem_t**)(orte_ns_replica.tags)->addr;
    for (i=0, j=0; j < orte_ns_replica.num_tags &&
                   i < (orte_ns_replica.tags)->size; i++) {
        if (NULL != ptr[i]) {
            j++;
            /* unnamed tags carry a NULL name - never pass that to %s */
            label = (NULL != ptr[i]->name) ? ptr[i]->name : "(null)";
            snprintf(tmp, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tTag id: %lu\tName: %s\n",
                     (unsigned long)j, (unsigned long)ptr[i]->tag, label);
            if (ORTE_SUCCESS != (rc = orte_dss.pack(buffer, &tmp, 1, ORTE_STRING))) {
                ORTE_ERROR_LOG(rc);
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return rc;
            }
        }
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
 * Print the replica's datatype tracker.
 *
 * Collects the dump into a temporary buffer with
 * orte_ns_replica_dump_datatypes_fn() and prints it with
 * orte_ns_base_print_dump().  The temporary buffer is destructed on every
 * path, including when collecting the dump fails part-way through.
 *
 * Returns ORTE_SUCCESS or the error code of the step that failed.
 */
int orte_ns_replica_dump_datatypes(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_datatypes_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
        /* the buffer may already hold packed strings - release it */
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    OBJ_DESTRUCT(&buffer);
    return ORTE_SUCCESS;
}
/*
 * Pack a textual dump of the datatype tracker into the caller's buffer.
 *
 * A header string is packed first, followed by one string per occupied
 * slot of orte_ns_replica.dts giving the running count, the datatype id
 * and its name.  The walk stops once num_dts occupied slots have been
 * reported or the array is exhausted.  The replica mutex is held for the
 * duration.
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack that failed.
 */
int orte_ns_replica_dump_datatypes_fn(orte_buffer_t *buffer)
{
    char text[NS_REPLICA_MAX_STRING_SIZE];
    char *line = text;      /* pack is handed the address of a char* */
    orte_ns_replica_dti_t **types;
    orte_std_cntr_t slot, shown;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Datatype Tracker\n");
    rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto done;
    }

    types = (orte_ns_replica_dti_t**)(orte_ns_replica.dts)->addr;
    shown = 0;
    for (slot = 0;
         shown < orte_ns_replica.num_dts && slot < (orte_ns_replica.dts)->size;
         slot++) {
        if (NULL == types[slot]) {
            continue;   /* empty slot */
        }
        shown++;

        snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tDatatype id: %lu\tName: %s\n",
                 (unsigned long)shown, (unsigned long)types[slot]->id, types[slot]->name);
        rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto done;
        }
    }

done:
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
/*
* TAG SERVER functions
*/
/*
 * Return the RML tag registered under "name", assigning a new one if needed.
 *
 * When name is non-NULL and an entry with the same name already exists, its
 * tag is returned.  Otherwise a new tag item is created whose tag value is
 * num_tags + ORTE_RML_TAG_DYNAMIC; the name (if any) is copied so that later
 * calls can look it up.  On failure *tag is left at ORTE_RML_TAG_MAX, which
 * is reserved as the invalid value.
 *
 * tag   [out] receives the assigned tag
 * name  [in]  optional name to associate with the tag; may be NULL
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE when no tag or memory is
 * available, or the error code from orte_pointer_array_add().
 */
int orte_ns_replica_assign_rml_tag(orte_rml_tag_t *tag,
                                   char *name)
{
    orte_ns_replica_tagitem_t *tagitem, **tags;
    orte_std_cntr_t i;
    orte_rml_tag_t j;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    if (NULL != name) {
        /* see if this name is already in list - if so, return tag */
        tags = (orte_ns_replica_tagitem_t**)orte_ns_replica.tags->addr;
        for (i=0, j=0; j < orte_ns_replica.num_tags &&
                       i < (orte_ns_replica.tags)->size; i++) {
            if (NULL != tags[i]) {
                j++;
                if (tags[i]->name != NULL &&
                    0 == strcmp(name, tags[i]->name)) { /* found name on list */
                    *tag = tags[i]->tag;
                    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                    return ORTE_SUCCESS;
                }
            }
        }
    }

    /* not in list or not provided, so allocate next tag */
    *tag = ORTE_RML_TAG_MAX;

    /* check if tag is available - need to do this since the tag type
     * is probably not going to be a orte_std_cntr_t, so we cannot just rely
     * on the pointer_array's size limits to protect us. NOTE: need to
     * reserve ORTE_RML_TAG_MAX as an invalid value, so can't let
     * num_tags get there
     */
    if (ORTE_RML_TAG_MAX-2 < orte_ns_replica.num_tags) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    tagitem = OBJ_NEW(orte_ns_replica_tagitem_t);
    if (NULL == tagitem) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_replica.tags, tagitem))) {
        ORTE_ERROR_LOG(rc);
        /* the array did not take the item, so nobody else will free it */
        OBJ_RELEASE(tagitem);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    tagitem->tag = orte_ns_replica.num_tags + ORTE_RML_TAG_DYNAMIC;
    (orte_ns_replica.num_tags)++;

    if (NULL != name) {  /* provided - can look it up later */
        tagitem->name = strdup(name);
    } else {
        tagitem->name = NULL;
    }

    *tag = tagitem->tag;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
* DATA TYPE SERVER functions
*/
/*
 * Return the data type id registered under "name", defining it if needed.
 *
 * If an entry with the same name already exists its id is returned.
 * Otherwise a new entry is created whose id is the current num_dts, and
 * num_dts is incremented.  On failure after the lookup, *type is left at
 * ORTE_DSS_ID_MAX.
 *
 * name  [in]     name of the data type; must not be NULL
 * type  [in/out] must not be NULL and must not already hold a value
 *                greater than zero; receives the id on success
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for invalid arguments,
 * ORTE_ERR_OUT_OF_RESOURCE when no id or memory is available, or the error
 * code from orte_pointer_array_add().
 */
int orte_ns_replica_define_data_type(const char *name,
                                     orte_data_type_t *type)
{
    orte_ns_replica_dti_t **dti, *dtip;
    orte_std_cntr_t i, j;
    char *name_copy;
    int rc;

    /* type is dereferenced below, so it has to be checked along with name */
    if (NULL == name || NULL == type || 0 < *type) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    dti = (orte_ns_replica_dti_t**)orte_ns_replica.dts->addr;
    for (i=0, j=0; j < orte_ns_replica.num_dts &&
                   i < orte_ns_replica.dts->size; i++) {
        if (NULL != dti[i]) {
            j++;
            if (dti[i]->name != NULL &&
                0 == strcmp(name, dti[i]->name)) { /* found name on list */
                *type = dti[i]->id;
                OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                return ORTE_SUCCESS;
            }
        }
    }

    /* not in list or not provided, so allocate next id */
    *type = ORTE_DSS_ID_MAX;

    /* check if id is available - need to do this since the data type
     * is probably not going to be a orte_std_cntr_t, so we cannot just rely
     * on the pointer_array's size limits to protect us.
     */
    if (ORTE_DSS_ID_MAX-2 < orte_ns_replica.num_dts) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* copy the name before touching the tracker so that an allocation
     * failure leaves the tracker unchanged */
    name_copy = strdup(name);
    if (NULL == name_copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    dtip = OBJ_NEW(orte_ns_replica_dti_t);
    if (NULL == dtip) { /* out of memory */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(name_copy);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&i,
                                orte_ns_replica.dts, dtip))) {
        ORTE_ERROR_LOG(rc);
        /* the array did not take the item, so release it here; the name
         * copy has not been attached yet and is freed separately */
        OBJ_RELEASE(dtip);
        free(name_copy);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return rc;
    }

    dtip->name = name_copy;
    dtip->id = orte_ns_replica.num_dts;
    (orte_ns_replica.num_dts)++;

    *type = dtip->id;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
* NAME functions
*/
/*
 * Build this process's own name and store it in orte_process_info.my_name.
 *
 * Three name service calls are chained: create a jobid, reserve a range of
 * one vpid in that job, then create a process name from cell 0, the new
 * jobid and the reserved vpid.  The chain stops at the first failure, which
 * is logged and returned.
 *
 * Returns ORTE_SUCCESS or the error code of the call that failed.
 */
int orte_ns_replica_create_my_name(void)
{
    orte_jobid_t new_job;
    orte_vpid_t first_vpid;
    int status;

    status = orte_ns.create_jobid(&new_job);

    if (ORTE_SUCCESS == status) {
        status = orte_ns.reserve_range(new_job, 1, &first_vpid);
    }

    if (ORTE_SUCCESS == status) {
        status = orte_ns.create_process_name(&(orte_process_info.my_name),
                                             0, new_job, first_vpid);
    }

    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
    return status;
}

Просмотреть файл

@ -39,6 +39,31 @@ extern "C" {
*/
#define NS_REPLICA_MAX_STRING_SIZE 256
/* class for tracking cellid's
 *
 * One instance describes one cell known to the replica: its id, its
 * descriptive strings, and the node ids that have been handed out within it.
 */
struct orte_ns_replica_cell_tracker_t {
opal_object_t super; /* base object */
orte_cellid_t cell; /* id of this cell; ORTE_CELLID_INVALID after construction */
char *site; /* site description; heap copy owned by the tracker, freed by its destructor */
char *resource; /* resource description; heap copy owned by the tracker, freed by its destructor */
orte_nodeid_t next_nodeid; /* id given to the next new node name in this cell; starts at 0 */
orte_pointer_array_t *nodeids; /* array of orte_ns_replica_nodeid_tracker_t* for this cell */
};
typedef struct orte_ns_replica_cell_tracker_t orte_ns_replica_cell_tracker_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_cell_tracker_t);
/* object for tracking nodeid's
 *
 * Pairs one node name with the node id assigned to it.
 */
struct orte_ns_replica_nodeid_tracker_t {
opal_object_t super; /* base object */
char *nodename; /* heap copy of the node name; NULL after construction, freed by the destructor */
orte_nodeid_t nodeid; /* id assigned to the node; ORTE_NODEID_INVALID after construction */
};
typedef struct orte_ns_replica_nodeid_tracker_t orte_ns_replica_nodeid_tracker_t;
OBJ_CLASS_DECLARATION(orte_ns_replica_nodeid_tracker_t);
/*
* object for tracking vpids and jobids for job families
* This structure is used to track the parent-child relationship between
@ -81,8 +106,8 @@ OBJ_CLASS_DECLARATION(orte_ns_replica_dti_t);
*/
typedef struct {
size_t max_size, block_size;
orte_nodeid_t next_nodeid;
orte_pointer_array_t *nodenames;
orte_cellid_t num_cells;
orte_pointer_array_t *cells;
orte_jobid_t num_jobids;
opal_list_t jobs;
orte_pointer_array_t *tags;
@ -118,11 +143,17 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata);
/*
* NODE FUNCTIONS
* CELL FUNCTIONS
*/
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames);
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource);
int orte_ns_replica_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
char **site, char **resource);
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodenames);
int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids);
/*
* JOB FUNCTIONS
@ -164,6 +195,9 @@ int orte_ns_replica_create_my_name(void);
/*
* DIAGNOSTIC FUNCTIONS
*/
int orte_ns_replica_dump_cells(void);
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer);
int orte_ns_replica_dump_jobs(void);
int orte_ns_replica_dump_jobs_fn(orte_buffer_t *buffer);

Просмотреть файл

@ -38,14 +38,141 @@
* functions
*/
/*
 * Create a cell, or update / look up an existing one.
 *
 * - *cellid is a valid id and a tracker with that id exists: its site and
 *   resource strings are replaced with copies of the ones given.
 * - *cellid is a valid id but no tracker has it: a new tracker is created
 *   with that id.
 * - *cellid is ORTE_CELLID_INVALID: the trackers are searched for one with
 *   the same site and resource; if found its id is returned in *cellid,
 *   otherwise a new tracker is created whose id is the current num_cells.
 *
 * cellid    [in/out] id to update/create, or ORTE_CELLID_INVALID to look up
 *                    or assign one; must not be NULL
 * site      [in]     site description; must not be NULL
 * resource  [in]     resource description; must not be NULL
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_BAD_PARAM for NULL arguments,
 * ORTE_ERR_OUT_OF_RESOURCE when no id or memory is available, or the error
 * code from orte_pointer_array_add().
 */
int orte_ns_replica_create_cellid(orte_cellid_t *cellid, char *site, char *resource)
{
    orte_ns_replica_cell_tracker_t *new_cell, **cell;
    char *site_copy, *resource_copy;
    int rc;
    orte_std_cntr_t i, j, index;

    OPAL_TRACE(1);

    /* both strings are copied on every path that stores them, including the
     * update of an existing cell, so they must be validated before anything
     * else is done */
    if (NULL == cellid || NULL == site || NULL == resource) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    new_cell = NULL;

    if (ORTE_CELLID_INVALID != *cellid) {
        /* a valid cellid was given - see if the cell info is already present */
        for (i=0, j=0; j < orte_ns_replica.num_cells &&
                       i < (orte_ns_replica.cells)->size; i++) {
            if (NULL != cell[i]) {
                j++;
                if (cell[i]->cell == *cellid) {
                    /* it is here - its info gets updated below */
                    new_cell = cell[i];
                    break;
                }
            }
        }
    } else {
        /* is this a known cell? */
        for (i=0, j=0; j < orte_ns_replica.num_cells &&
                       i < (orte_ns_replica.cells)->size; i++) {
            if (NULL != cell[i]) {
                j++;
                if (NULL != cell[i]->site && NULL != cell[i]->resource &&
                    0 == strcmp(site, cell[i]->site) &&
                    0 == strcmp(resource, cell[i]->resource)) {
                    *cellid = cell[i]->cell;
                    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
                    return ORTE_SUCCESS;
                }
            }
        }
        *cellid = orte_ns_replica.num_cells;
    }

    /* new cell - check if cellid is available */
    if (NULL == new_cell && ORTE_CELLID_MAX-1 < orte_ns_replica.num_cells) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* make the copies first so that a failed allocation cannot leave an
     * existing tracker with freed strings */
    site_copy = strdup(site);
    resource_copy = strdup(resource);
    if (NULL == site_copy || NULL == resource_copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(site_copy);
        free(resource_copy);
        OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (NULL == new_cell) {
        new_cell = OBJ_NEW(orte_ns_replica_cell_tracker_t);
        if (NULL == new_cell) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(site_copy);
            free(resource_copy);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        if (ORTE_SUCCESS != (rc = orte_pointer_array_add(&index, orte_ns_replica.cells, new_cell))) {
            ORTE_ERROR_LOG(rc);
            /* the array did not take the tracker, so release it here */
            OBJ_RELEASE(new_cell);
            free(site_copy);
            free(resource_copy);
            OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
            return rc;
        }
        (orte_ns_replica.num_cells)++;
        new_cell->cell = *cellid;
    } else {
        /* existing cell - drop the old descriptive info */
        free(new_cell->site);
        free(new_cell->resource);
    }

    new_cell->site = site_copy;
    new_cell->resource = resource_copy;

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return ORTE_SUCCESS;
}
/*
 * Look up the site and resource strings recorded for a cell.
 *
 * Walks the occupied slots of orte_ns_replica.cells under the replica mutex.
 * On a match, *site and *resource receive strdup'ed copies of the tracker's
 * strings.
 *
 * Returns ORTE_SUCCESS when the cell is found and ORTE_ERR_NOT_FOUND
 * otherwise.  Not finding the cell is not treated as an error, so it is
 * deliberately not reported through ORTE_ERROR_LOG.
 */
int orte_ns_replica_get_cell_info(orte_cellid_t cellid,
                                  char **site, char **resource)
{
    orte_ns_replica_cell_tracker_t **table;
    orte_std_cntr_t slot;
    orte_cellid_t visited;
    int status;

    OPAL_TRACE(1);

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    status = ORTE_ERR_NOT_FOUND;
    table = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    visited = 0;

    for (slot = 0;
         visited < orte_ns_replica.num_cells && slot < (orte_ns_replica.cells)->size;
         slot++) {
        if (NULL == table[slot]) {
            continue;   /* empty slot */
        }
        visited++;
        if (cellid != table[slot]->cell) {
            continue;   /* some other cell */
        }
        *site = strdup(table[slot]->site);
        *resource = strdup(table[slot]->resource);
        status = ORTE_SUCCESS;
        break;
    }

    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return status;
}
/*
* NODEID
*/
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes, char **nodenames)
int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nnodes,
orte_cellid_t cellid, char **nodenames)
{
orte_nodeid_t *nds, nid, m;
orte_std_cntr_t k, n, num_nodes;
char **nodes;
orte_ns_replica_cell_tracker_t **cell, *cptr;
orte_ns_replica_nodeid_tracker_t **nodes, *node;
orte_nodeid_t *nds, nid;
orte_std_cntr_t i, j, k, m, n, num_nodes;
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
@ -62,20 +189,54 @@ int orte_ns_replica_create_nodeids(orte_nodeid_t **nodeids, orte_std_cntr_t *nno
return ORTE_ERR_OUT_OF_RESOURCE;
}
nodes = (char**)(orte_ns_replica.nodenames->addr);
/** find the cell */
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0, j=0; j < orte_ns_replica.num_cells &&
i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cell[i]) {
j++;
if (cellid == cell[i]->cell) {
/** found the specified cell - check to see if nodename has already been
* defined. if so, just return the nodeid. if not, create a new one
*/
cptr = cell[i];
goto PROCESS;
}
}
}
/** get here if we didn't find the cell */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
free(nds);
*nodeids = NULL;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
PROCESS:
nodes = (orte_ns_replica_nodeid_tracker_t**)(cptr->nodeids->addr);
for (n=0; n < num_nodes; n++) {
for (k=0, m=0; m < orte_ns_replica.next_nodeid &&
k < (orte_ns_replica.nodenames)->size; k++) {
for (k=0, m=0; m < cptr->next_nodeid &&
k < (cptr->nodeids)->size; k++) {
if (NULL != nodes[k]) {
m++;
if (strcmp(nodenames[n], nodes[k]) == 0) { /** found same name */
nid = m;
if (strcmp(nodenames[n], nodes[k]->nodename) == 0) { /** found same name */
nid = nodes[k]->nodeid;
goto ASSIGN;
}
}
}
/** get here if we don't find this nodename - add it */
nid = orte_ns_replica.next_nodeid++;
/** get here if we don't find this nodename - add one */
node = OBJ_NEW(orte_ns_replica_nodeid_tracker_t);
if (NULL == node) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
free(nds);
*nodeids = NULL;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_OUT_OF_RESOURCE;
}
node->nodename = strdup(nodenames[n]);
node->nodeid = cptr->next_nodeid;
cptr->next_nodeid++;
nid = node->nodeid;
ASSIGN:
nds[n] = nid;
@ -88,13 +249,16 @@ ASSIGN:
return ORTE_SUCCESS;
}
int orte_ns_replica_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
int orte_ns_replica_get_node_info(char ***nodenames, orte_cellid_t cellid,
orte_std_cntr_t num_nodes, orte_nodeid_t *nodeids)
{
char **names;
orte_std_cntr_t n;
char **nodes;
char **names, *nm;
orte_ns_replica_cell_tracker_t **cell, *cptr;
orte_ns_replica_nodeid_tracker_t **nodes;
orte_std_cntr_t i, j, k, m, n;
char *err_name = "NODE_NOT_FOUND";
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
OPAL_THREAD_LOCK(&orte_ns_replica.mutex);
if (0 == num_nodes) {
*nodenames = NULL;
@ -109,16 +273,49 @@ int orte_ns_replica_get_node_info(char ***nodenames, orte_std_cntr_t num_nodes,
}
names[num_nodes] = NULL; /** NULL-terminate the list */
nodes = (char**)(orte_ns_replica.nodenames->addr);
for (n=0; n < num_nodes; n++) {
if (nodeids[n] >= orte_ns_replica.next_nodeid) {
names[n] = strdup("invalid nodeid");
} else if (NULL != nodes[nodeids[n]]) {
names[n] = strdup(nodes[nodeids[n]]);
} else {
names[n] = strdup("unknown nodeid");
/** find the cell */
cell = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0, j=0; j < orte_ns_replica.num_cells &&
i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cell[i]) {
j++;
if (cellid == cell[i]->cell) {
/** found the specified cell - check to see if nodename has already been
* defined. if so, just return the nodeid. if not, create a new one
*/
cptr = cell[i];
goto PROCESS;
}
}
}
/** get here if we didn't find the cell */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
free(names);
*nodenames = NULL;
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_FOUND;
PROCESS:
nodes = (orte_ns_replica_nodeid_tracker_t**)(cell[i]->nodeids->addr);
for (n=0; n < num_nodes; n++) {
for (k=0, m=0; m < cell[i]->next_nodeid &&
k < (cell[i]->nodeids)->size; k++) {
if (NULL != nodes[k]) {
m++;
if (nodeids[n] == nodes[k]->nodeid) { /** found it */
nm = nodes[k]->nodename;
goto ASSIGN;
}
}
}
/** node not found - set name to error name. Can't set it to NULL since
* the list is a NULL-terminated one
*/
nm = err_name;
ASSIGN:
names[n] = strdup(nm);
}
*nodenames = names;

Просмотреть файл

@ -34,6 +34,67 @@
extern "C" {
#endif
/*** CELLID ***/
/* constructor - puts a new cell_tracker into its empty state: invalid
 * cell id, no site/resource strings, node ids starting at 0, and a freshly
 * initialized pointer array sized from the replica's block_size/max_size
 */
static void orte_ns_replica_cell_tracker_construct(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    /* descriptive strings are attached later by whoever creates the cell */
    cell_tracker->resource = NULL;
    cell_tracker->site = NULL;
    cell_tracker->cell = ORTE_CELLID_INVALID;

    /* per-cell node bookkeeping */
    cell_tracker->next_nodeid = 0;
    orte_pointer_array_init(&(cell_tracker->nodeids),
                            orte_ns_replica.block_size,
                            orte_ns_replica.max_size,
                            orte_ns_replica.block_size);
}
/* destructor - frees the site/resource strings, releases the nodeid
 * trackers stored in the nodeids array (stopping after next_nodeid occupied
 * slots or at the end of the array), then releases the array itself
 */
static void orte_ns_replica_cell_tracker_destructor(orte_ns_replica_cell_tracker_t* cell_tracker)
{
    orte_ns_replica_nodeid_tracker_t **entries;
    orte_std_cntr_t slot, released;

    /* free() accepts NULL, so unset strings need no special case */
    free(cell_tracker->site);
    free(cell_tracker->resource);

    entries = (orte_ns_replica_nodeid_tracker_t**)(cell_tracker->nodeids)->addr;
    released = 0;
    for (slot = 0;
         released < cell_tracker->next_nodeid && slot < (cell_tracker->nodeids)->size;
         slot++) {
        if (NULL == entries[slot]) {
            continue;   /* empty slot */
        }
        released++;
        OBJ_RELEASE(entries[slot]);
    }

    OBJ_RELEASE(cell_tracker->nodeids);
}
/* define instance of opal_class_t */
OBJ_CLASS_INSTANCE(orte_ns_replica_cell_tracker_t, /* type name */
opal_object_t, /* parent "class" name */
orte_ns_replica_cell_tracker_construct, /* constructor */
orte_ns_replica_cell_tracker_destructor); /* destructor */
/** NODEID */
/* constructor - a new nodeid tracker has no name and an invalid node id */
static void orte_ns_replica_nodeid_tracker_construct(orte_ns_replica_nodeid_tracker_t *ptr)
{
    ptr->nodename = NULL;
    ptr->nodeid = ORTE_NODEID_INVALID;
}
/* destructor - the tracker owns its node name string */
static void orte_ns_replica_nodeid_tracker_destructor(orte_ns_replica_nodeid_tracker_t *ptr)
{
    /* free() accepts NULL, so a tracker that never got a name is fine */
    free(ptr->nodename);
}
OBJ_CLASS_INSTANCE(orte_ns_replica_nodeid_tracker_t, /* type name */
opal_object_t, /* parent "class" name */
orte_ns_replica_nodeid_tracker_construct, /* constructor */
orte_ns_replica_nodeid_tracker_destructor); /* destructor */
/*** JOBITEM ***/
/* constructor - used to initialize state of jobitem instance */
static void orte_ns_replica_jobitem_construct(orte_ns_replica_jobitem_t *ptr)

Просмотреть файл

@ -77,6 +77,12 @@ orte_ns_replica_finalize /* module shutdown */
static mca_ns_base_module_t orte_ns_replica_module = {
/* init */
orte_ns_replica_module_init,
/* cell functions */
orte_ns_replica_create_cellid,
orte_ns_replica_get_cell_info,
orte_ns_base_get_cellid_string,
orte_ns_base_convert_cellid_to_string,
orte_ns_base_convert_string_to_cellid,
/** node functions */
orte_ns_replica_create_nodeids,
orte_ns_replica_get_node_info,
@ -111,6 +117,7 @@ static mca_ns_base_module_t orte_ns_replica_module = {
/* data type functions */
orte_ns_replica_define_data_type,
/* diagnostic functions */
orte_ns_replica_dump_cells,
orte_ns_replica_dump_jobs,
orte_ns_replica_dump_tags,
orte_ns_replica_dump_datatypes,
@ -188,15 +195,15 @@ mca_ns_base_module_t* orte_ns_replica_init(int *priority)
*priority = 50;
/* initialize the node tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.nodenames),
/* initialize the cell info tracker */
if (ORTE_SUCCESS != (rc = orte_pointer_array_init(&(orte_ns_replica.cells),
(orte_std_cntr_t)orte_ns_replica.block_size,
(orte_std_cntr_t)orte_ns_replica.max_size,
(orte_std_cntr_t)orte_ns_replica.block_size))) {
ORTE_ERROR_LOG(rc);
return NULL;
}
orte_ns_replica.next_nodeid = 0;
orte_ns_replica.num_cells = 0;
/* initialize the job tracking system */
OBJ_CONSTRUCT(&orte_ns_replica.jobs, opal_list_t);
@ -258,25 +265,22 @@ int orte_ns_replica_module_init(void)
*/
int orte_ns_replica_finalize(void)
{
char **cptr;
orte_ns_replica_cell_tracker_t **cptr;
opal_list_item_t *item;
orte_ns_replica_tagitem_t **tag;
orte_ns_replica_dti_t **dti;
orte_std_cntr_t i;
orte_nodeid_t j;
/* free all tracking storage, but only if this component was initialized */
if (initialized) {
cptr = (char**)(orte_ns_replica.nodenames)->addr;
for (i=0, j=0; j < orte_ns_replica.next_nodeid &&
i < (orte_ns_replica.nodenames)->size; i++) {
cptr = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
for (i=0; i < (orte_ns_replica.cells)->size; i++) {
if (NULL != cptr[i]) {
j++;
free(cptr[i]);
OBJ_RELEASE(cptr[i]);
}
}
OBJ_RELEASE(orte_ns_replica.nodenames);
OBJ_RELEASE(orte_ns_replica.cells);
while (NULL != (item = opal_list_remove_first(&orte_ns_replica.jobs))) {
OBJ_RELEASE(item);

Просмотреть файл

@ -37,6 +37,72 @@
/*
* DIAGNOSTIC functions
*/
/*
 * Print the replica's cell tracker.
 *
 * Collects the dump into a temporary buffer with
 * orte_ns_replica_dump_cells_fn() and prints it with
 * orte_ns_base_print_dump().  The temporary buffer is destructed on every
 * path, including when collecting the dump fails part-way through.
 *
 * Returns ORTE_SUCCESS or the error code of the step that failed.
 */
int orte_ns_replica_dump_cells(void)
{
    orte_buffer_t buffer;
    int rc;

    OBJ_CONSTRUCT(&buffer, orte_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&buffer))) {
        ORTE_ERROR_LOG(rc);
        /* the buffer may already hold packed strings - release it */
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_ns_base_print_dump(&buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    OBJ_DESTRUCT(&buffer);
    return ORTE_SUCCESS;
}
/*
 * Pack a textual dump of the cell tracker into the caller's buffer.
 *
 * A header string is packed first, followed by two strings per occupied
 * slot of orte_ns_replica.cells: one with the running count and cell id,
 * one with the site and resource.  The walk stops once num_cells occupied
 * slots have been reported or the array is exhausted.  The replica mutex
 * is held for the duration.
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack that failed.
 */
int orte_ns_replica_dump_cells_fn(orte_buffer_t *buffer)
{
    char text[NS_REPLICA_MAX_STRING_SIZE];
    char *line = text;      /* pack is handed the address of a char* */
    orte_ns_replica_cell_tracker_t **trackers;
    orte_std_cntr_t slot;
    orte_cellid_t shown;
    int rc;

    OPAL_THREAD_LOCK(&orte_ns_replica.mutex);

    snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Dump of Name Service Cell Tracker\n");
    rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
    if (ORTE_SUCCESS != rc) {
        goto done;
    }

    trackers = (orte_ns_replica_cell_tracker_t**)(orte_ns_replica.cells)->addr;
    shown = 0;
    for (slot = 0;
         shown < orte_ns_replica.num_cells && slot < (orte_ns_replica.cells)->size;
         slot++) {
        if (NULL == trackers[slot]) {
            continue;   /* empty slot */
        }
        shown++;

        snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "Num: %lu\tCell: %lu\n",
                 (unsigned long)shown, (unsigned long)trackers[slot]->cell);
        rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto done;
        }

        snprintf(line, NS_REPLICA_MAX_STRING_SIZE, "\tSite: %s\n\tResource: %s\n",
                 trackers[slot]->site, trackers[slot]->resource);
        rc = orte_dss.pack(buffer, &line, 1, ORTE_STRING);
        if (ORTE_SUCCESS != rc) {
            goto done;
        }
    }

done:
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
    return rc;
}
int orte_ns_replica_dump_jobs(void)
{
orte_buffer_t buffer;

Просмотреть файл

@ -41,6 +41,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
{
orte_std_cntr_t i, isave, npeers;
orte_jobid_t *jptr;
orte_cellid_t *cptr;
orte_attribute_t *attr;
orte_ns_replica_jobitem_t *job_info, *child;
opal_list_item_t *item;
@ -55,12 +56,31 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
*procs = NULL;
*num_procs = 0;
/* check the attributes to see if USE_JOB has been set. If not, then this is
/* check the attributes to see if USE_JOB or USE_CELL has been set. If not, then this is
* a request for my own job peers - process that one locally
*/
/* if the cell is given AND it matches my own, then we can process this
* quickly. Otherwise, we have to do some more work.
*
* RHC: when we go multi-cell, we need a way to find all the cells upon
* which a job is executing so we can make this work!
*/
if (NULL != (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_CELL))) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, attr->value, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return rc;
}
if (*cptr != ORTE_PROC_MY_NAME->cellid && *cptr != ORTE_CELLID_WILDCARD) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
OPAL_THREAD_UNLOCK(&orte_ns_replica.mutex);
return ORTE_ERR_NOT_IMPLEMENTED;
}
}
if (NULL == (attr = orte_rmgr.find_attribute(attrs, ORTE_NS_USE_JOBID))) {
/* get my own job peers */
/* get my own job peers, assuming all are on this cell */
*procs = (orte_process_name_t*)malloc(orte_process_info.num_procs * sizeof(orte_process_name_t));
if (NULL == *procs) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
@ -69,6 +89,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
}
for (i=0; i < orte_process_info.num_procs; i++) {
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
(*procs)[i].jobid = ORTE_PROC_MY_NAME->jobid;
(*procs)[i].vpid = orte_process_info.vpid_start + i;
}
@ -130,6 +151,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
while (NULL != (item = opal_list_remove_first(&peerlist))) {
child = (orte_ns_replica_jobitem_t*)item;
for (i=0; i < child->next_vpid; i++) {
(*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid;
(*procs)[i+isave].jobid = child->jobid;
(*procs)[i+isave].vpid = i;
}
@ -164,6 +186,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
/* populate it, starting with the specified job followed by its children */
for (i=0; i < job_info->next_vpid; i++) {
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
(*procs)[i].jobid = *jptr;
(*procs)[i].vpid = i;
}
@ -173,6 +196,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
item = opal_list_get_next(item)) {
child = (orte_ns_replica_jobitem_t*)item;
for (i=0; i < child->next_vpid; i++) {
(*procs)[i+isave].cellid = ORTE_PROC_MY_NAME->cellid;
(*procs)[i+isave].jobid = child->jobid;
(*procs)[i+isave].vpid = i;
}
@ -196,6 +220,7 @@ int orte_ns_replica_get_peers(orte_process_name_t **procs,
}
for (i=0; i < job_info->next_vpid; i++) {
(*procs)[i].cellid = ORTE_PROC_MY_NAME->cellid;
(*procs)[i].jobid = *jptr;
(*procs)[i].vpid = i;
}
@ -367,7 +392,8 @@ int orte_ns_replica_create_my_name(void)
return rc;
}
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name), jobid, vpid))) {
if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name),
0, jobid, vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -58,14 +58,15 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
orte_buffer_t answer, error_answer;
orte_ns_cmd_flag_t command;
opal_list_t attrs;
orte_cellid_t cell;
orte_jobid_t job, root, *descendants;
orte_vpid_t startvpid, range;
char *tagname;
char *tagname, *site, *resource;
orte_rml_tag_t oob_tag;
orte_data_type_t type;
orte_std_cntr_t count, nprocs, nret;
orte_process_name_t *procs;
int rc=ORTE_SUCCESS;
int rc=ORTE_SUCCESS, ret;
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &command, &count, ORTE_NS_CMD))) {
@ -81,6 +82,69 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
}
switch (command) {
case ORTE_NS_CREATE_CELLID_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &site, &count, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &resource, &count, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
rc = orte_ns_replica_create_cellid(&cell, site, resource);
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &cell, 1, ORTE_CELLID))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_GET_CELL_INFO_CMD:
count = 1;
if (ORTE_SUCCESS != (rc = orte_dss.unpack(buffer, &cell, &count, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
rc = ORTE_ERR_BAD_PARAM;
goto RETURN_ERROR;
}
site = NULL;
resource = NULL;
rc = orte_ns_replica_get_cell_info(cell, &site, &resource);
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &site, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(&answer, &resource, 1, ORTE_STRING))) {
ORTE_ERROR_LOG(ret);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_CREATE_NODEID_CMD:
case ORTE_NS_GET_NODE_INFO_CMD:
ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
@ -390,6 +454,17 @@ void orte_ns_replica_recv(int status, orte_process_name_t* sender,
}
break;
case ORTE_NS_DUMP_CELLS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_cells_fn(&answer))) {
ORTE_ERROR_LOG(rc);
goto RETURN_ERROR;
}
if (0 > orte_rml.send_buffer(sender, &answer, tag, 0)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
goto RETURN_ERROR;
}
break;
case ORTE_NS_DUMP_JOBIDS_CMD:
if (ORTE_SUCCESS != (rc = orte_ns_replica_dump_jobs_fn(&answer))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -677,8 +677,8 @@ orte_odls_bproc_launch_local_procs(orte_gpr_notify_data_t *data,)
child = (odls_bproc_child_t *) item;
if(0 < mca_odls_bproc_component.debug) {
opal_output(0, "orte_odls_bproc_launch: setting up io for "
"%s proc rank %ld\n",
ORTE_NAME_PRINT((child->name)),
"[%ld,%ld,%ld] proc rank %ld\n",
ORTE_NAME_ARGS((child->name)),
(long)child->name->vpid);
}
/* only setup to forward stdin if it is rank 0, otherwise connect

Просмотреть файл

@ -530,8 +530,8 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: working on job %ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)job);
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job);
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
@ -547,8 +547,8 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
/* preserve the pointer to the next item in list in case we release it */
next = opal_list_get_next(item);
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: checking child process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
/* do we have a child from the specified job? Because the
* job could be given as a WILDCARD value, we must use
@ -565,8 +565,8 @@ int orte_odls_default_kill_local_procs(orte_jobid_t job, bool set_state)
* to do to it
*/
if (!child->alive) {
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: child %s is not alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child [%ld,%ld,%ld] is not alive",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
/* ensure, though, that the state is terminated so we don't lockup if
* the proc never started
*/
@ -702,8 +702,8 @@ GOTCHILD:
exception is detected and handled (in which case this unpublish
request will be ignored/discarded). */
opal_output(orte_odls_globals.output,
"odls: pid %ld corresponds to %s\n",
(long) pid, ORTE_NAME_PRINT(child->name));
"odls: pid %ld corresponds to [%lu,%lu,%lu]\n",
(long) pid, ORTE_NAME_ARGS(child->name));
if (0 == child->name->vpid) {
rc = orte_iof.iof_unpublish(child->name, ORTE_NS_CMP_ALL,
ORTE_IOF_STDIN);
@ -751,20 +751,20 @@ GOTCHILD:
/* the abort file must exist - there is nothing in it we need. Its
* mere existence indicates that an abnormal termination occurred
*/
opal_output(orte_odls_globals.output, "odls: child %s died by abort",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
ORTE_NAME_ARGS(child->name));
aborted = true;
free(abort_file);
} else {
opal_output(orte_odls_globals.output, "odls: child process %s terminated normally",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
ORTE_NAME_ARGS(child->name));
}
} else {
/* the process was terminated with a signal! That's definitely
* abnormal, so indicate that condition
*/
opal_output(orte_odls_globals.output, "odls: child process %s terminated with signal",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
ORTE_NAME_ARGS(child->name));
aborted = true;
}
@ -1419,6 +1419,7 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
filem_request = OBJ_NEW(orte_filem_base_request_t);
filem_request->num_procs = 1;
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
filem_request->proc_name[0].cellid = orte_process_info.gpr_replica->cellid;
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
if(app_item->app_context->preload_binary) {
@ -1517,8 +1518,8 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
* If it has been launched, then do nothing
*/
if (child->alive) {
opal_output(orte_odls_globals.output, "odls: child %s is already alive",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is already alive",
ORTE_NAME_ARGS(child->name));
continue;
}
@ -1527,13 +1528,13 @@ int orte_odls_default_launch_local_procs(orte_gpr_notify_data_t *data)
* the dss.compare function to check for equality.
*/
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
opal_output(orte_odls_globals.output, "odls: child %s is not in job %ld being launched",
ORTE_NAME_PRINT(child->name), (long)job);
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is not in job %ld being launched",
ORTE_NAME_ARGS(child->name), (long)job);
continue;
}
opal_output(orte_odls_globals.output, "odls: preparing to launch child %s",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]",
ORTE_NAME_ARGS(child->name));
/* find the indicated app_context in the list */
for (item2 = opal_list_get_first(&app_context_list);
@ -1710,8 +1711,8 @@ int orte_odls_default_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, o
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
continue;
}
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child %s",
(unsigned long)tag, ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child [%ld, %ld, %ld]",
(unsigned long)tag, ORTE_NAME_ARGS(child->name));
/* if so, send the message */
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);

Просмотреть файл

@ -248,8 +248,8 @@ static int orte_odls_process_kill_local_procs(orte_jobid_t job, bool set_state)
OBJ_CONSTRUCT(&procs_killed, opal_list_t);
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: working on job %ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)job);
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: working on job %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)job);
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
@ -265,8 +265,8 @@ static int orte_odls_process_kill_local_procs(orte_jobid_t job, bool set_state)
/* preserve the pointer to the next item in list in case we release it */
next = opal_list_get_next(item);
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: checking child process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: checking child process [%ld,%ld,%ld]",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
/* do we have a child from the specified job? Because the
* job could be given as a WILDCARD value, we must use
@ -283,8 +283,8 @@ static int orte_odls_process_kill_local_procs(orte_jobid_t job, bool set_state)
* to do to it
*/
if (!child->alive) {
opal_output(orte_odls_globals.output, "%s odls_kill_local_proc: child %s is not alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "[%ld,%ld,%ld] odls_kill_local_proc: child [%ld,%ld,%ld] is not alive",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(child->name));
/* ensure, though, that the state is terminated so we don't lockup if
* the proc never started
*/
@ -418,8 +418,8 @@ GOTCHILD:
exception is detected and handled (in which case this unpublish
request will be ignored/discarded). */
opal_output(orte_odls_globals.output,
"odls: pid %ld corresponds to %s\n",
(long) pid, ORTE_NAME_PRINT(child->name));
"odls: pid %ld corresponds to [%lu,%lu,%lu]\n",
(long) pid, ORTE_NAME_ARGS(child->name));
#if 0
if (0 == child->name->vpid) {
rc = orte_iof.iof_unpublish(child->name, ORTE_NS_CMP_ALL,
@ -461,20 +461,20 @@ GOTCHILD:
/* the abort file must exist - there is nothing in it we need. Its
* mere existence indicates that an abnormal termination occurred
*/
opal_output(orte_odls_globals.output, "odls: child %s died by abort",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] died by abort",
ORTE_NAME_ARGS(child->name));
aborted = true;
free(abort_file);
} else {
opal_output(orte_odls_globals.output, "odls: child process %s terminated normally",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated normally",
ORTE_NAME_ARGS(child->name));
}
} else {
/* the process was terminated with a signal! That's definitely
* abnormal, so indicate that condition
*/
opal_output(orte_odls_globals.output, "odls: child process %s terminated with signal",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child process [%ld,%ld,%ld] terminated with signal",
ORTE_NAME_ARGS(child->name));
aborted = true;
}
@ -1007,6 +1007,7 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
filem_request = OBJ_NEW(orte_filem_base_request_t);
filem_request->num_procs = 1;
filem_request->proc_name = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * filem_request->num_procs);
filem_request->proc_name[0].cellid = orte_process_info.gpr_replica->cellid;
filem_request->proc_name[0].jobid = orte_process_info.gpr_replica->jobid;
filem_request->proc_name[0].vpid = orte_process_info.gpr_replica->vpid;
if(app_item->app_context->preload_binary) {
@ -1105,8 +1106,8 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
* If it has been launched, then do nothing
*/
if (child->alive) {
opal_output(orte_odls_globals.output, "odls: child %s is already alive",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is already alive",
ORTE_NAME_ARGS(child->name));
continue;
}
@ -1115,13 +1116,13 @@ static int orte_odls_process_launch_local_procs(orte_gpr_notify_data_t *data)
* the dss.compare function to check for equality.
*/
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
opal_output(orte_odls_globals.output, "odls: child %s is not in job %ld being launched",
ORTE_NAME_PRINT(child->name), (long)job);
opal_output(orte_odls_globals.output, "odls: child [%ld,%ld,%ld] is not in job %ld being launched",
ORTE_NAME_ARGS(child->name), (long)job);
continue;
}
opal_output(orte_odls_globals.output, "odls: preparing to launch child %s",
ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: preparing to launch child [%ld, %ld, %ld]",
ORTE_NAME_ARGS(child->name));
/* find the indicated app_context in the list */
for (item2 = opal_list_get_first(&app_context_list);
@ -1272,8 +1273,8 @@ int orte_odls_process_deliver_message(orte_jobid_t job, orte_buffer_t *buffer, o
if (ORTE_EQUAL != orte_dss.compare(&job, &(child->name->jobid), ORTE_JOBID)) {
continue;
}
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child %s",
(unsigned long)tag, ORTE_NAME_PRINT(child->name));
opal_output(orte_odls_globals.output, "odls: sending message to tag %lu on child [%ld, %ld, %ld]",
(unsigned long)tag, ORTE_NAME_ARGS(child->name));
/* if so, send the message */
rc = orte_rml.send_buffer(child->name, buffer, tag, 0);

Просмотреть файл

@ -147,7 +147,7 @@ int mca_oob_xcast_nb(orte_jobid_t job,
DONE:
if (orte_timing) {
gettimeofday(&stop, NULL);
opal_output(0, "xcast_nb %s: time %ld usec", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_output(0, "xcast_nb [%ld,%ld,%ld]: time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
(stop.tv_usec - start.tv_usec)));
}
@ -226,7 +226,7 @@ DONE:
if (orte_timing) {
gettimeofday(&stop, NULL);
opal_output(0, "xcast %s: time %ld usec", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_output(0, "xcast [%ld,%ld,%ld]: time %ld usec", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME),
(long int)((stop.tv_sec - start.tv_sec)*1000000 +
(stop.tv_usec - start.tv_usec)));
}
@ -314,11 +314,12 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
}
if (orte_timing) {
opal_output(0, "xcast %s: mode binomial buffer size %ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
opal_output(0, "xcast [%ld,%ld,%ld]: mode binomial buffer size %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
}
/* start setting up the target recipients */
target.cellid = ORTE_PROC_MY_NAME->cellid;
target.jobid = 0;
/* compute the bitmap */
@ -358,12 +359,13 @@ static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
orte_oob_xcast_num_active += binomial_xcast_num_active;
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
target.cellid = ORTE_PROC_MY_NAME->cellid;
target.jobid = 0;
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
peer = rank | mask;
if (peer < size) {
target.vpid = (orte_vpid_t)peer;
opal_output(mca_oob_base_output, "%s xcast to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&target));
opal_output(mca_oob_base_output, "[%ld,%ld,%ld] xcast to [%ld,%ld,%ld]", ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&target));
if (0 > (rc = mca_oob_send_packed_nb(&target, buf, ORTE_RML_TAG_ORTED_ROUTED,
0, mca_oob_xcast_send_cb, NULL))) {
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
@ -450,8 +452,8 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
}
if (orte_timing) {
opal_output(0, "xcast %s: mode linear buffer size %ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
opal_output(0, "xcast [%ld,%ld,%ld]: mode linear buffer size %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
}
/* get the number of daemons out there */
@ -486,6 +488,7 @@ static int mca_oob_xcast_linear(orte_jobid_t job,
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
/* send the message to each daemon as fast as we can */
dummy.cellid = ORTE_PROC_MY_NAME->cellid;
dummy.jobid = 0;
for (i=0; i < range; i++) {
if (ORTE_PROC_MY_NAME->vpid != i) { /* don't send to myself */
@ -541,8 +544,8 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
OBJ_DESTRUCT(&attrs);
if (orte_timing) {
opal_output(0, "xcast %s: mode direct buffer size %ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buffer->bytes_used);
opal_output(0, "xcast [%ld,%ld,%ld]: mode direct buffer size %ld",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), (long)buffer->bytes_used);
}
/* we have to account for all of the messages we are about to send
@ -555,7 +558,7 @@ static int mca_oob_xcast_direct(orte_jobid_t job,
OPAL_THREAD_UNLOCK(&orte_oob_xcast_mutex);
for(i=0; i<n; i++) {
opal_output(mca_oob_base_output, "oob_xcast: sending to %s", ORTE_NAME_PRINT(peers+i));
opal_output(mca_oob_base_output, "oob_xcast: sending to [%ld,%ld,%ld]", ORTE_NAME_ARGS(peers+i));
if (0 > (rc = mca_oob_send_packed_nb(peers+i, buffer, tag, 0, mca_oob_xcast_send_cb, NULL))) {
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);

Просмотреть файл

@ -460,8 +460,8 @@ static void mca_oob_tcp_accept(int incoming_sd)
/* log the accept */
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s mca_oob_tcp_accept: %s:%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_accept: %s:%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
opal_net_get_hostname((struct sockaddr*) &addr),
opal_net_get_port((struct sockaddr*) &addr));
}
@ -672,8 +672,8 @@ static void* mca_oob_tcp_listen_thread(opal_object_t *obj)
}
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_thread: (%d, %d) %s:%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
item->fd, opal_socket_errno,
inet_ntoa(item->addr.sin_addr),
item->addr.sin_port);
@ -743,8 +743,8 @@ static int mca_oob_tcp_listen_progress(void)
/* log the accept */
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s mca_oob_tcp_listen_progress: %s:%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_listen_progress: %s:%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
inet_ntoa(item->addr.sin_addr),
item->addr.sin_port);
}
@ -851,9 +851,9 @@ static void mca_oob_tcp_recv_probe(int sd, mca_oob_tcp_hdr_t* hdr)
int retval = send(sd, (char *)ptr+cnt, sizeof(mca_oob_tcp_hdr_t)-cnt, 0);
if(retval < 0) {
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_probe: send() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(hdr->msg_src)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_probe: send() failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(hdr->msg_src)),
strerror(opal_socket_errno),
opal_socket_errno);
CLOSE_THE_SOCKET(sd);
@ -877,13 +877,13 @@ static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr)
/* now set socket up to be non-blocking */
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
opal_output(0, "%s mca_oob_tcp_recv_handler: fcntl(F_GETFL) failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: fcntl(F_GETFL) failed: %s (%d)",
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
} else {
flags |= O_NONBLOCK;
if(fcntl(sd, F_SETFL, flags) < 0) {
opal_output(0, "%s mca_oob_tcp_recv_handler: fcntl(F_SETFL) failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: fcntl(F_SETFL) failed: %s (%d)",
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
}
}
@ -898,24 +898,25 @@ static void mca_oob_tcp_recv_connect(int sd, mca_oob_tcp_hdr_t* hdr)
if (ORTE_SUCCESS != orte_ns.reserve_range(hdr->msg_src.jobid, 1, &hdr->msg_src.vpid)) {
return;
}
hdr->msg_src.cellid = ORTE_PROC_MY_NAME->cellid;
}
/* lookup the corresponding process */
peer = mca_oob_tcp_peer_lookup(&hdr->msg_src);
if(NULL == peer) {
opal_output(0, "%s mca_oob_tcp_recv_handler: unable to locate peer",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: unable to locate peer",
ORTE_NAME_ARGS(orte_process_info.my_name));
CLOSE_THE_SOCKET(sd);
return;
}
/* is the peer instance willing to accept this connection */
if(mca_oob_tcp_peer_accept(peer, sd) == false) {
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
opal_output(0, "%s-%s mca_oob_tcp_recv_handler: "
"rejected connection from %s connection state %d",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_PRINT(&(hdr->msg_src)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_recv_handler: "
"rejected connection from [%lu,%lu,%lu] connection state %d",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
ORTE_NAME_ARGS(&(hdr->msg_src)),
peer->peer_state);
}
CLOSE_THE_SOCKET(sd);
@ -954,15 +955,15 @@ static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
while((rc = recv(sd, (char *)&hdr, sizeof(hdr), 0)) != sizeof(hdr)) {
if(rc >= 0) {
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
opal_output(0, "%s mca_oob_tcp_recv_handler: peer closed connection",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: peer closed connection",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
CLOSE_THE_SOCKET(sd);
return;
}
if(opal_socket_errno != EINTR) {
opal_output(0, "%s mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: recv() failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name), strerror(opal_socket_errno), opal_socket_errno);
CLOSE_THE_SOCKET(sd);
return;
}
@ -978,8 +979,8 @@ static void mca_oob_tcp_recv_handler(int sd, short flags, void* user)
mca_oob_tcp_recv_connect(sd, &hdr);
break;
default:
opal_output(0, "%s mca_oob_tcp_recv_handler: invalid message type: %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name), hdr.msg_type);
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_recv_handler: invalid message type: %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name), hdr.msg_type);
CLOSE_THE_SOCKET(sd);
break;
}
@ -1079,8 +1080,8 @@ void mca_oob_tcp_registry_callback(
mca_oob_tcp_peer_t* peer;
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s mca_oob_tcp_registry_callback\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
/* process the callback */
@ -1115,15 +1116,15 @@ void mca_oob_tcp_registry_callback(
addr = mca_oob_tcp_addr_unpack(&buffer);
OBJ_DESTRUCT(&buffer);
if(NULL == addr) {
opal_output(0, "%s mca_oob_tcp_registry_callback: unable to unpack peer address\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback: unable to unpack peer address\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
continue;
}
if(mca_oob_tcp_component.tcp_debug > OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s mca_oob_tcp_registry_callback: received peer %s\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(addr->addr_name)));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_registry_callback: received peer [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(addr->addr_name)));
}
/* check for existing cache entry */
@ -1132,8 +1133,8 @@ void mca_oob_tcp_registry_callback(
if(NULL != existing && ORTE_EQUAL != orte_dss.compare(ORTE_PROC_MY_NAME, &addr->addr_name, ORTE_NAME)) {
/* need to update existing entry - but don't update our own entry! */
if(mca_oob_tcp_component.tcp_debug > OOB_TCP_DEBUG_INFO) {
opal_output( 0, "%s Received OOB update for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&addr->addr_name) );
opal_output( 0, "[%ld,%ld,%ld] Received OOB update for [%ld,%ld,%ld]",
ORTE_NAME_ARGS(ORTE_PROC_MY_NAME), ORTE_NAME_ARGS(&addr->addr_name) );
}
orte_hash_table_set_proc(&mca_oob_tcp_component.tcp_peer_names, &addr->addr_name, addr);
OBJ_RELEASE(addr);
@ -1216,8 +1217,8 @@ int mca_oob_tcp_init(void)
16); /* increment to grow by */
opal_progress_register(mca_oob_tcp_listen_progress);
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s accepting connections via listen thread",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] accepting connections via listen thread",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
} else {
/* fix up the listen_type, since we might have been in thread,
@ -1245,8 +1246,8 @@ int mca_oob_tcp_init(void)
}
#endif
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s accepting connections via event library",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] accepting connections via event library",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
}
@ -1269,8 +1270,8 @@ int mca_oob_tcp_register_subscription(orte_jobid_t jobid, char *trigger)
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
opal_output(0, "%s mca_oob_tcp_init: calling orte_gpr.subscribe\n",
ORTE_NAME_PRINT(orte_process_info.my_name));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_init: calling orte_gpr.subscribe\n",
ORTE_NAME_ARGS(orte_process_info.my_name));
}
if (ORTE_SUCCESS != (rc = orte_schema.get_std_subscription_name(&sub_name,
@ -1367,9 +1368,9 @@ int mca_oob_tcp_register_contact_info(void)
tmp2 = strrchr(tmp, '/') + 1;
tmp3 = strrchr(tmp, ':');
if(NULL == tmp2 || NULL == tmp3) {
opal_output(0, "%s mca_oob_tcp_init: invalid address \'%s\' "
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_init: invalid address \'%s\' "
"returned for selected oob interfaces.\n",
ORTE_NAME_PRINT(orte_process_info.my_name), tmp);
ORTE_NAME_ARGS(orte_process_info.my_name), tmp);
ORTE_ERROR_LOG(ORTE_ERROR);
free(tmp);
free(bo.bytes);

Просмотреть файл

@ -257,9 +257,9 @@ bool mca_oob_tcp_msg_send_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK)
return false;
else {
opal_output(0, "%s-%s mca_oob_tcp_msg_send_handler: writev failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_send_handler: writev failed: %s (%d)",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
mca_oob_tcp_peer_close(peer);
@ -305,9 +305,9 @@ bool mca_oob_tcp_msg_recv_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
if(msg->msg_hdr.msg_size > 0) {
msg->msg_rwbuf = malloc(msg->msg_hdr.msg_size);
if(NULL == msg->msg_rwbuf) {
opal_output(0, "%s-%s mca_oob_tcp_msg_recv_handler: malloc(%d) failed\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv_handler: malloc(%d) failed\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
msg->msg_hdr.msg_size);
mca_oob_tcp_peer_close(peer);
return false;
@ -321,9 +321,9 @@ bool mca_oob_tcp_msg_recv_handler(mca_oob_tcp_msg_t* msg, struct mca_oob_tcp_pee
msg->msg_rwnum = 0;
}
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s-%s mca_oob_tcp_msg_recv_handler: size %lu\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv_handler: size %lu\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
(unsigned long)(msg->msg_hdr.msg_size) );
}
}
@ -362,9 +362,9 @@ static bool mca_oob_tcp_msg_recv(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pee
under UNIX/Linux environments */
else if (opal_socket_errno == EAGAIN || opal_socket_errno == EWOULDBLOCK)
return false;
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: readv failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv: readv failed: %s (%d)",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
mca_oob_tcp_peer_close(peer);
@ -372,9 +372,9 @@ static bool mca_oob_tcp_msg_recv(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* pee
return false;
} else if (rc == 0) {
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT_FAIL) {
opal_output(0, "%s-%s mca_oob_tcp_msg_recv: peer closed connection",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)));
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_msg_recv: peer closed connection",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)));
}
mca_oob_tcp_peer_close(peer);
mca_oob_call_exception_handlers(&peer->peer_name, MCA_OOB_PEER_DISCONNECTED);
@ -417,9 +417,11 @@ void mca_oob_tcp_msg_recv_complete(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* p
mca_oob_tcp_msg_data(msg,peer);
break;
default:
opal_output(0, "%s mca_oob_tcp_msg_recv_complete: invalid message type: %d from peer %s\n",
ORTE_NAME_PRINT(orte_process_info.my_name), msg->msg_hdr.msg_type,
ORTE_NAME_PRINT(&peer->peer_name));
opal_output(0, "[%lu,%lu,%lu] mca_oob_tcp_msg_recv_complete: invalid message type: %d from peer [%lu,%lu,%lu]\n",
ORTE_NAME_ARGS(orte_process_info.my_name), msg->msg_hdr.msg_type,
(long)(peer->peer_name.cellid),
(long)(peer->peer_name.jobid),
(long)(peer->peer_name.vpid));
MCA_OOB_TCP_MSG_RETURN(msg);
break;
}

Просмотреть файл

@ -312,20 +312,20 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
do {
/* pick an address in round-robin fashion from the list exported by the peer */
if(ORTE_SUCCESS != (rc = mca_oob_tcp_addr_get_next(peer->peer_addr, (struct sockaddr_storage*) &inaddr))) {
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
"mca_oob_tcp_addr_get_next failed with error=%d",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
rc);
mca_oob_tcp_peer_close(peer);
return ORTE_ERR_UNREACH;
}
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
"connecting port %d to: %s:%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
/* Bug, FIXME: output tcp6_listen_port for AF_INET6 */
ntohs(mca_oob_tcp_component.tcp_listen_port),
opal_net_get_hostname((struct sockaddr*) &inaddr),
@ -380,10 +380,10 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
return ORTE_SUCCESS;
}
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
"connect to %s:%d failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
opal_net_get_hostname((struct sockaddr*) &inaddr),
opal_net_get_port((struct sockaddr*) &inaddr),
strerror(opal_socket_errno),
@ -408,10 +408,10 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
return ORTE_SUCCESS;
} else {
opal_output(0,
"%s-%s mca_oob_tcp_peer_try_connect: "
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
"mca_oob_tcp_peer_send_connect_ack to %s:%d failed: %s (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
opal_net_get_hostname((struct sockaddr*) &inaddr),
opal_net_get_port((struct sockaddr*) &inaddr),
opal_strerror(rc),
@ -420,10 +420,10 @@ static int mca_oob_tcp_peer_try_connect(mca_oob_tcp_peer_t* peer)
} while(peer->peer_addr->addr_next != 0);
/* None of the interfaces worked.. */
opal_output(0, "%s-%s mca_oob_tcp_peer_try_connect: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_try_connect: "
"connect to %s:%d failed, connecting over all interfaces failed!",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
opal_net_get_hostname((struct sockaddr*) &inaddr),
opal_net_get_port((struct sockaddr*) &inaddr));
mca_oob_tcp_peer_close(peer);
@ -455,9 +455,9 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
#endif
struct timeval tv = { 1,0 };
opal_output(0,
"%s-%s mca_oob_tcp_peer_start_connect: socket() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: socket() failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
mca_oob_tcp_peer_shutdown(peer);
@ -481,17 +481,17 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
/* setup the socket as non-blocking */
if (peer->peer_sd >= 0) {
if((flags = fcntl(peer->peer_sd, F_GETFL, 0)) < 0) {
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
} else {
flags |= O_NONBLOCK;
if(fcntl(peer->peer_sd, F_SETFL, flags) < 0)
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
}
@ -499,16 +499,16 @@ static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
#if OPAL_WANT_IPV6
if (peer->peer6_sd >= 0) {
if((flags = fcntl(peer->peer6_sd, F_GETFL, 0)) < 0) {
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed with errno=%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed with errno=%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
opal_socket_errno);
} else {
flags |= O_NONBLOCK;
if(fcntl(peer->peer6_sd, F_SETFL, flags) < 0)
opal_output(0, "%s-%s mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed with errno=%d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed with errno=%d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
opal_socket_errno);
}
}
@ -548,9 +548,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
/* check connect completion status */
if(getsockopt(sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: getsockopt() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: getsockopt() failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
mca_oob_tcp_peer_close(peer);
@ -573,17 +573,17 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
return;
} else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
struct timeval tv = { 1,0 };
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
"connection failed: %s (%d) - retrying\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(so_error),
so_error);
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
"sending ack, %d",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)), so_error);
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)), so_error);
}
mca_oob_tcp_peer_shutdown(peer);
@ -612,9 +612,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd)
opal_event_add(&peer->peer_recv_event, 0);
#endif
} else {
opal_output(0, "%s-%s mca_oob_tcp_peer_complete_connect: unable to send connect ack.",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)));
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: unable to send connect ack.",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)));
mca_oob_tcp_peer_close(peer);
}
}
@ -673,9 +673,9 @@ static void mca_oob_tcp_peer_connected(mca_oob_tcp_peer_t* peer, int sd)
void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
{
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s-%s mca_oob_tcp_peer_close(%p) sd %d state %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_close(%p) sd %d state %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
(void*)peer,
peer->peer_sd,
peer->peer_state);
@ -683,9 +683,9 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer)
#if OPAL_WANT_IPV6
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0, "%s-%s mca_oob_tcp_peer_close(%p) sd6 %d state %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_close(%p) sd6 %d state %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
(void*)peer,
peer->peer6_sd,
peer->peer_state);
@ -713,9 +713,9 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
mca_oob_tcp_msg_t *msg;
opal_output(0, "%s-%s mca_oob_tcp_peer_shutdown: retries exceeded",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)));
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_shutdown: retries exceeded",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)));
/* There are cases during the initial connection setup where
the peer_send_msg is NULL but there are things in the queue
@ -802,10 +802,10 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd)
struct timeval tv = { 1,0 };
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
opal_output(0,
"%s-%s mca_oob_tcp_peer_recv_connect_ack "
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack "
"connect failed during receive. Restarting (%s).",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno));
}
opal_event_del(&peer->peer_recv_event);
@ -830,11 +830,11 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd)
/* compare the peers name to the expected value */
if (0 != orte_ns.compare_fields(ORTE_NS_CMP_ALL, &peer->peer_name, &hdr.msg_src)) {
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_connect_ack: "
"received unexpected process identifier %s\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_PRINT(&(hdr.msg_src)));
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack: "
"received unexpected process identifier [%ld,%ld,%ld]\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
ORTE_NAME_ARGS(&(hdr.msg_src)));
mca_oob_tcp_peer_close(peer);
return ORTE_ERR_UNREACH;
}
@ -844,7 +844,8 @@ static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer, int sd)
* comparison - we do NOT want wildcard values to return EQUAL
*/
if(orte_process_info.my_name == NULL) {
orte_ns.create_process_name(&orte_process_info.my_name, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
orte_ns.create_process_name(&orte_process_info.my_name,
hdr.msg_dst.cellid, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
} else if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, orte_process_info.my_name, ORTE_NAME_INVALID) == ORTE_EQUAL) {
*orte_process_info.my_name = hdr.msg_dst;
}
@ -872,10 +873,10 @@ static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, int sd, void
/* remote closed connection */
if(retval == 0) {
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_blocking: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_blocking: "
"peer closed connection: peer state %d",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
peer->peer_state);
}
mca_oob_tcp_peer_close(peer);
@ -905,10 +906,10 @@ static int mca_oob_tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, int sd, void
return -1;
} else {
opal_output(0,
"%s-%s mca_oob_tcp_peer_recv_blocking: "
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_blocking: "
"recv() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(errno),
errno);
mca_oob_tcp_peer_close(peer);
@ -934,9 +935,9 @@ static int mca_oob_tcp_peer_send_blocking(mca_oob_tcp_peer_t* peer, int sd, void
int retval = send(sd, (char *)ptr+cnt, size-cnt, 0);
if(retval < 0) {
if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
opal_output(0, "%s-%s mca_oob_tcp_peer_send_blocking: send() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_send_blocking: send() failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
strerror(opal_socket_errno),
opal_socket_errno);
mca_oob_tcp_peer_close(peer);
@ -1010,9 +1011,9 @@ static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
mca_oob_tcp_msg_t* msg;
MCA_OOB_TCP_MSG_ALLOC(msg, rc);
if(NULL == msg) {
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)));
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)));
return;
}
msg->msg_type = MCA_OOB_TCP_UNEXPECTED;
@ -1040,9 +1041,9 @@ static void mca_oob_tcp_peer_recv_handler(int sd, short flags, void* user)
}
default:
{
opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_handler: invalid socket state(%d)",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
peer->peer_state);
mca_oob_tcp_peer_close(peer);
break;
@ -1087,9 +1088,9 @@ static void mca_oob_tcp_peer_send_handler(int sd, short flags, void* user)
break;
}
default:
opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: invalid connection state (%d)",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_send_handler: invalid connection state (%d)",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
peer->peer_state);
opal_event_del(&peer->peer_send_event);
break;
@ -1158,9 +1159,9 @@ static void mca_oob_tcp_peer_dump(mca_oob_tcp_peer_t* peer, const char* msg)
nodelay = 0;
#endif
sprintf(buff, "%s-%s %s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
sprintf(buff, "[%lu,%lu,%lu]-[%lu,%lu,%lu] %s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
msg, src, dst, nodelay, sndbuf, rcvbuf, flags);
opal_output(0, buff);
}
@ -1193,10 +1194,10 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer, int sd)
mca_oob_tcp_peer_event_init(peer);
if(mca_oob_tcp_peer_send_connect_ack(peer, sd) != ORTE_SUCCESS) {
opal_output(0, "%s-%s mca_oob_tcp_peer_accept: "
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_accept: "
"mca_oob_tcp_peer_send_connect_ack failed\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)));
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)));
mca_oob_tcp_peer_close(peer);
OPAL_THREAD_UNLOCK(&peer->peer_lock);
return false;

Просмотреть файл

@ -91,9 +91,9 @@ int mca_oob_tcp_ping(
/* parse uri string */
if(ORTE_SUCCESS != (rc = mca_oob_tcp_parse_uri(uri, &inaddr))) {
opal_output(0,
"%s-%s mca_oob_tcp_ping: invalid uri: %s\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT((orte_process_name_t*)name),
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: invalid uri: %s\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(name),
uri);
return rc;
}
@ -106,9 +106,9 @@ int mca_oob_tcp_ping(
#endif
if (sd < 0) {
opal_output(0,
"%s-%s mca_oob_tcp_ping: socket() failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT((orte_process_name_t*)name),
"[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: socket() failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(name),
strerror(opal_socket_errno),
opal_socket_errno);
return ORTE_ERR_UNREACH;
@ -116,17 +116,17 @@ int mca_oob_tcp_ping(
/* setup the socket as non-blocking */
if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
opal_output(0, "%s-%s mca_oob_tcp_ping: fcntl(F_GETFL) failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT((orte_process_name_t*)name),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: fcntl(F_GETFL) failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(name),
strerror(opal_socket_errno),
opal_socket_errno);
} else {
flags |= O_NONBLOCK;
if(fcntl(sd, F_SETFL, flags) < 0) {
opal_output(0, "%s-%s mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT((orte_process_name_t*)name),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(name),
strerror(opal_socket_errno),
opal_socket_errno);
}
@ -154,9 +154,9 @@ int mca_oob_tcp_ping(
/* set socket back to blocking */
flags &= ~O_NONBLOCK;
if(fcntl(sd, F_SETFL, flags) < 0) {
opal_output(0, "%s-%s mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT((orte_process_name_t*)name),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_ping: fcntl(F_SETFL) failed: %s (%d)\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(name),
strerror(opal_socket_errno),
opal_socket_errno);
}

Просмотреть файл

@ -46,9 +46,9 @@ int mca_oob_tcp_recv(
int i, rc = 0, size = 0;
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
opal_output(0, "%s-%s mca_oob_tcp_recv: tag %d\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(peer),
opal_output(0, "[%lu,%lu,%lu]-[%ld,%ld,%ld] mca_oob_tcp_recv: tag %d\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(peer),
tag);
}
@ -65,9 +65,9 @@ int mca_oob_tcp_recv(
}
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s-%s mca_oob_tcp_recv*unexpected*: tag %d size %lu\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(peer),
opal_output(0, "[%lu,%lu,%lu]-[%ld,%ld,%ld] mca_oob_tcp_recv*unexpected*: tag %d size %lu\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(peer),
tag, (unsigned long)(msg->msg_hdr.msg_size) );
}
/* if we are returning an allocated buffer - just take it from the message */
@ -117,9 +117,9 @@ int mca_oob_tcp_recv(
}
if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {
opal_output(0, "%s-%s mca_oob_tcp_recv*expected*: tag %d size %lu\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(peer),
opal_output(0, "[%lu,%lu,%lu]-[%ld,%ld,%ld] mca_oob_tcp_recv*expected*: tag %d size %lu\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(peer),
tag, (unsigned long)(size) );
}

Просмотреть файл

@ -107,9 +107,9 @@ int mca_oob_tcp_send(
}
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
opal_output(0, "%s-%s mca_oob_tcp_send: tag %d size %lu\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_send: tag %d size %lu\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
tag, (unsigned long)size );
}
@ -214,9 +214,9 @@ int mca_oob_tcp_send_nb(
}
if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_ALL) {
opal_output(0, "%s-%s mca_oob_tcp_send_nb: tag %d size %lu\n",
ORTE_NAME_PRINT(orte_process_info.my_name),
ORTE_NAME_PRINT(&(peer->peer_name)),
opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_send_nb: tag %d size %lu\n",
ORTE_NAME_ARGS(orte_process_info.my_name),
ORTE_NAME_ARGS(&(peer->peer_name)),
tag, (unsigned long)size );
}

Просмотреть файл

@ -0,0 +1,376 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/orte_constants.h"
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmgr/rmgr.h"
#include "orte/mca/pls/base/pls_private.h"
/* constructor - puts every field of a new daemon info object into its
 * "nothing recorded yet" state: NULL pointers and the INVALID id constants
 */
static void orte_pls_daemon_info_construct(orte_pls_daemon_info_t* ptr)
{
    /* pointer members first, so the destructor never sees garbage */
    ptr->name = NULL;
    ptr->nodename = NULL;

    /* identifier members */
    ptr->active_job = ORTE_JOBID_INVALID;
    ptr->cell = ORTE_CELLID_INVALID;
}
/* destructor - used to free any resources held by instance
 *
 * Releases the nodename string and the name held by the object. Both
 * members are set to NULL by the constructor, and free(NULL) is defined to
 * do nothing, so no NULL guard is needed before either call.
 */
static void orte_pls_daemon_info_destructor(orte_pls_daemon_info_t* ptr)
{
    free(ptr->nodename);
    free(ptr->name);
}
/* Declare the class instance for orte_pls_daemon_info_t: it derives from
 * opal_list_item_t and uses the constructor/destructor pair defined above.
 */
OBJ_CLASS_INSTANCE(orte_pls_daemon_info_t, /* type name */
opal_list_item_t, /* parent "class" name */
orte_pls_daemon_info_construct, /* constructor */
orte_pls_daemon_info_destructor); /* destructor */
/*
 * Store the active daemons for a job
 *
 * For every orte_pls_daemon_info_t on the given list, one registry value is
 * built on the node segment: its tokens come from the daemon's cell and
 * nodename, and it carries a single keyval whose key is
 * "<ORTE_NODE_BOOTPROXY_KEY>-<jobid string>" and whose value is the daemon's
 * name. All values are then handed to orte_gpr.put() in one call.
 *
 * @param daemons  list of orte_pls_daemon_info_t items to record
 *
 * @return ORTE_SUCCESS when the list is empty or the put succeeded,
 *         ORTE_ERR_OUT_OF_RESOURCE when memory could not be obtained,
 *         otherwise the error code of the first call that failed.
 *
 * Every exit path after the array allocation goes through CLEANUP, so the
 * value objects built so far and the array itself are always released.
 */
int orte_pls_base_store_active_daemons(opal_list_t *daemons)
{
    orte_pls_daemon_info_t *dmn;
    opal_list_item_t *item;
    orte_gpr_value_t **values;
    char *jobid_string, *key;
    int rc, i, num_daemons;

    /* determine the number of daemons */
    num_daemons = (int)opal_list_get_size(daemons);
    if (0 == num_daemons) {
        return ORTE_SUCCESS;
    }

    /* since each daemon gets recorded in a separate node's container,
     * we need to allocate space for num_daemons value objects
     */
    values = (orte_gpr_value_t**)malloc(num_daemons * sizeof(orte_gpr_value_t*));
    if (NULL == values) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    /* NULL the array - CLEANUP uses NULL to recognize slots never filled */
    memset(values, 0, num_daemons*sizeof(orte_gpr_value_t*));

    /* loop through the values and the list and create all the value objects */
    item = opal_list_get_first(daemons);
    for (i=0; i < num_daemons; i++) {
        dmn = (orte_pls_daemon_info_t*)item;

        if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&values[i],
                                                        ORTE_GPR_OVERWRITE,
                                                        ORTE_NODE_SEGMENT,
                                                        1, 0))) {
            /* report the code that is actually returned to the caller */
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }

        if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens),
                                                              dmn->cell, dmn->nodename))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }

        /* setup the key */
        if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, dmn->active_job))) {
            ORTE_ERROR_LOG(rc);
            /* go through CLEANUP so that all values built so far and the
             * array are released, not just the first entry
             */
            goto CLEANUP;
        }
        key = NULL;
        if (0 > asprintf(&key, "%s-%s", ORTE_NODE_BOOTPROXY_KEY, jobid_string)) {
            /* the contents of key are not usable after a failed asprintf */
            free(jobid_string);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        free(jobid_string);

        /* the key string is ours to free whether or not the keyval was built */
        rc = orte_gpr.create_keyval(&(values[i]->keyvals[0]), key, ORTE_NAME, dmn->name);
        free(key);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }

        item = opal_list_get_next(item);
    }

    rc = orte_gpr.put(num_daemons, values);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

CLEANUP:
    for (i=0; i < num_daemons; i++) {
        if (NULL != values[i]) OBJ_RELEASE(values[i]);
    }
    free(values);

    return rc;
}
/*
 * Collect the daemons recorded on the node segment for one job and append
 * them to the given list.
 *
 * The registry is queried for three keys per node container: the per-job
 * "<ORTE_NODE_BOOTPROXY_KEY>-<jobid string>" key (daemon name), the node
 * name key and the cellid key. A container only yields a list entry when
 * all three were returned. Entries whose name equals ORTE_PROC_MY_NAME are
 * skipped; if the list was non-empty on entry, entries whose name is
 * already on the list are skipped as well.
 *
 * @param daemons  list that receives new orte_pls_daemon_info_t items
 * @param job      jobid whose daemons are wanted
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE when memory could not be
 *         obtained, or the error code of the first call that failed.
 *
 * NOTE(review): the active_job field of the created items is left at the
 * value the constructor gave it - confirm whether it should be set to job.
 */
static int get_daemons(opal_list_t *daemons, orte_jobid_t job)
{
    orte_gpr_value_t **values;
    orte_gpr_keyval_t *kv;
    orte_std_cntr_t cnt, i, j;
    char* jobid_string;
    char *keys[] = {
        NULL, /* placeholder */
        ORTE_NODE_NAME_KEY,
        ORTE_CELLID_KEY,
        NULL
    };
    orte_cellid_t *cell;
    char *nodename;
    orte_process_name_t *name;
    orte_pls_daemon_info_t *dmn, *dmn2;
    bool found_name, found_node, found_cell;
    opal_list_item_t *item;
    bool check_dups;
    int rc;

    /* check the list to see if there is anything already on it. If there is, then
     * we will need to check for duplicate entries before we add something. If not,
     * then this can go a lot faster
     */
    if (0 < opal_list_get_size(daemons)) {
        check_dups = true;
    } else {
        check_dups = false;
    }

    /* setup the key */
    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, job))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (0 > asprintf(&keys[0], "%s-%s", ORTE_NODE_BOOTPROXY_KEY, jobid_string)) {
        /* keys[0] is not usable after a failed asprintf, so stop before it
         * reaches the registry query, strcmp or free
         */
        free(jobid_string);
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    free(jobid_string);

    /* query the daemon info */
    if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
                                           ORTE_NODE_SEGMENT,
                                           NULL, /* all containers */
                                           keys,
                                           &cnt, &values))) {
        ORTE_ERROR_LOG(rc);
        free(keys[0]);
        return rc;
    }

    /* loop through the answers and construct the list */
    for (i=0; i < cnt; i++) {
        /* for systems such as bproc, the node segment holds containers
         * for nodes that we may not have launched upon. Each container
         * will send us back a value object, so we have to ensure here
         * that we only create daemon objects on the list for those nodes
         * that DO provide a valid object
         */
        found_name = found_node = found_cell = false;
        for (j=0; j < values[i]->cnt; j++) {
            kv = values[i]->keyvals[j];
            if (0 == strcmp(kv->key, keys[0])) {
                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&name, kv->value, ORTE_NAME))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                found_name = true;
                continue;
            }
            if (0 == strcmp(kv->key, ORTE_NODE_NAME_KEY)) {
                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nodename, kv->value, ORTE_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                found_node = true;
                continue;
            }
            if (0 == strcmp(kv->key, ORTE_CELLID_KEY)) {
                if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cell, kv->value, ORTE_CELLID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                found_cell = true;
                continue;
            }
        }

        /* if we found everything, then this is a valid entry. name, nodename
         * and cell are only read below when their found_* flag is set
         */
        if (found_name && found_node && found_cell) {
            /* first check if this name is ourself - if so, ignore it */
            if (ORTE_EQUAL == orte_dss.compare(name, ORTE_PROC_MY_NAME, ORTE_NAME)) {
                goto MOVEON;
            }
            if (check_dups) {
                /* see if this daemon is already on the list - if so, then we don't add it */
                for (item = opal_list_get_first(daemons);
                     item != opal_list_get_end(daemons);
                     item = opal_list_get_next(item)) {
                    dmn2 = (orte_pls_daemon_info_t*)item;
                    if (ORTE_EQUAL == orte_dss.compare(dmn2->name, name, ORTE_NAME)) {
                        /* already on list - ignore it */
                        goto MOVEON;
                    }
                }
            }
            dmn = OBJ_NEW(orte_pls_daemon_info_t);
            if (NULL == dmn) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto CLEANUP;
            }
            /* the list item gets its own copies of the name and nodename */
            if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(dmn);
                goto CLEANUP;
            }
            dmn->cell = *cell;
            if (NULL != nodename) {
                dmn->nodename = strdup(nodename);
            }
            /* add this daemon to the list */
            opal_list_append(daemons, &dmn->super);
        }

    MOVEON:
        /* done with this answer, whether or not it produced a list entry */
        OBJ_RELEASE(values[i]);
    }

CLEANUP:
    /* release whatever answers were not consumed by the loop above.
     * NOTE(review): this presumes OBJ_RELEASE leaves an already released
     * values[i] as NULL so it is not released twice - confirm against the
     * macro definition.
     */
    for (i=0; i < cnt; i++) {
        if (NULL != values[i]) OBJ_RELEASE(values[i]);
    }
    if (NULL != values) free(values);
    free(keys[0]);

    return rc;
}
/*
 * Retrieve a list of the active daemons for a job
 *
 * Which jobs are examined depends on the attribute list:
 *  - ORTE_NS_INCLUDE_DESCENDANTS present: the jobs returned by
 *    orte_ns.get_job_descendants() for the given job
 *  - else ORTE_NS_INCLUDE_CHILDREN present: the jobs returned by
 *    orte_ns.get_job_children() for the given job
 *  - otherwise: only the given job
 * The daemons of each selected job are appended to the list.
 *
 * @param daemons  list that receives the orte_pls_daemon_info_t items
 * @param job      jobid to start from
 * @param attrs    attribute list searched with orte_rmgr.find_attribute()
 *
 * @return ORTE_SUCCESS, or the error code of the first call that failed.
 *         A failure while collecting one job's daemons stops the loop and
 *         is returned to the caller instead of being reported as success;
 *         daemons appended before the failure remain on the list.
 */
int orte_pls_base_get_active_daemons(opal_list_t *daemons, orte_jobid_t job, opal_list_t *attrs)
{
    orte_jobid_t *jobs;
    orte_std_cntr_t njobs, i;
    bool allocated;
    int rc = ORTE_SUCCESS;

    if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_DESCENDANTS)) {
        /* need to include all descendants in list */
        if (ORTE_SUCCESS != (rc = orte_ns.get_job_descendants(&jobs, &njobs, job))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        allocated = true;
    } else if (NULL != orte_rmgr.find_attribute(attrs, ORTE_NS_INCLUDE_CHILDREN)) {
        /* just include the direct children of the job */
        if (ORTE_SUCCESS != (rc = orte_ns.get_job_children(&jobs, &njobs, job))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        allocated = true;
    } else {
        /* just want daemons for this one job - point at the parameter
         * itself, so there is nothing to free afterwards
         */
        jobs = &job;
        njobs = 1;
        allocated = false;
    }

    /* loop through all the jobs and get their info */
    for (i=0; i < njobs; i++) {
        if (ORTE_SUCCESS != (rc = get_daemons(daemons, jobs[i]))) {
            ORTE_ERROR_LOG(rc);
            break;
        }
    }

    if (allocated) free(jobs);

    /* rc holds ORTE_SUCCESS unless one of the calls above failed */
    return rc;
}
/*
 * Remove a daemon from the world of active daemons
 *
 * Placeholder implementation: the daemon description passed in is not
 * examined and nothing is deleted. ORTE_SUCCESS is always returned.
 */
int orte_pls_base_remove_daemon(orte_pls_daemon_info_t *info)
{
    int status = ORTE_SUCCESS;

    /* Still to be done: a registry delete function call
     * targeting the entry
     */
    (void)info;

    return status;
}
/*
 * Check for available daemons we can re-use
 *
 * daemons - list passed on to orte_pls_base_get_active_daemons()
 * job     - the job on whose behalf the search is made
 *
 * Returns ORTE_SUCCESS, or the error code of the first call that failed.
 */
int orte_pls_base_check_avail_daemons(opal_list_t *daemons,
                                      orte_jobid_t job)
{
    orte_jobid_t root_job;
    orte_jobid_t *family = NULL;
    orte_std_cntr_t family_size, idx;
    int rc;

    /* Jobs in one family must exit together, so a daemon spawned by any
     * member of our extended family is fair game for re-use. The family
     * is found by locating our root job and then asking for all of the
     * root's descendants.
     */
    rc = orte_ns.get_root_job(&root_job, job);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = orte_ns.get_job_descendants(&family, &family_size, root_job);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* walk the family, collecting each member's daemons as we go */
    idx = 0;
    while (idx < family_size) {
        rc = orte_pls_base_get_active_daemons(daemons, family[idx], NULL);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            free(family);
            return rc;
        }
        ++idx;
    }

    /* the family array is no longer needed */
    free(family);

    /* finally pick up any persistent daemons - they are tagged as
     * bootproxies for jobid = 0
     */
    rc = orte_pls_base_get_active_daemons(daemons, 0, NULL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    return rc;
}

Просмотреть файл

@ -423,6 +423,16 @@ static void orte_pls_bproc_setup_env(char *** env)
/**
* Launches the daemons
* @param cellid the cellid of the job
* @param envp a pointer to the environment to use for the daemons
* @param node_arrays an array that holds the node arrays for each app context
* @param node_array_lens an array of lengths of the node arrays
* @param num_contexts the number of application contexts
* @param num_procs the number of processes in the job
* @param global_vpid_start the starting vpid for the user's processes
* @param jobid the jobid for the user processes
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
@ -519,7 +529,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
free(var);
/* set up the base environment so the daemons can get their names once launched */
rc = orte_ns_nds_bproc_put(0, map->daemon_vpid_start,
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, 0, map->daemon_vpid_start,
0, num_daemons, ORTE_VPID_INVALID, 1, envp);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -626,7 +636,7 @@ static int orte_pls_bproc_launch_daemons(orte_job_map_t *map, char ***envp) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto cleanup;
}
rc = orte_pls_bproc_set_node_pid(param, map->job, pids[i]);
rc = orte_pls_bproc_set_node_pid(ORTE_PROC_MY_NAME->cellid, param, map->job, pids[i]);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -764,6 +774,17 @@ orte_pls_bproc_node_failed(orte_gpr_notify_message_t *msg)
/**
* Launches the application processes
* @param cellid the cellid of the job
* @param jobid the jobid of the job
* @param map a pointer to the mapping of this application
* @param num_processes the number of processes in this job
* @param vpid_start the starting vpid for this app context
* @param global_vpid_start the starting vpid for the user's processes
* @param app_context the application context number
* @param node_array the node array for this context
* @param node_array_len the length of the node array
* @retval ORTE_SUCCESS
* @retval error
*/
/* When working in this function, ALWAYS jump to "cleanup" if
@ -887,7 +908,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
}
/* setup environment so the procs can figure out their names */
rc = orte_ns_nds_bproc_put(map->job, vpid_start, map->vpid_start,
rc = orte_ns_nds_bproc_put(ORTE_PROC_MY_NAME->cellid, map->job, vpid_start, map->vpid_start,
num_processes, i, num_cycles, &env);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -943,7 +964,7 @@ static int orte_pls_bproc_launch_app(orte_job_map_t* map, int num_slots,
ORTE_ERROR_LOG(rc);
goto cleanup;
} else {
rc = orte_ns.create_process_name(&proc_name, map->job,
rc = orte_ns.create_process_name(&proc_name, ORTE_PROC_MY_NAME->cellid, map->job,
vpid_start + j*stride);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -93,7 +93,7 @@ ORTE_DECLSPEC int orte_pls_bproc_get_proc_pids(orte_jobid_t jobid, pid_t** pids,
/**
* Utility routine to get/set daemon pid
*/
ORTE_DECLSPEC int orte_pls_bproc_set_node_pid(char* node_name, orte_jobid_t jobid, pid_t pid);
ORTE_DECLSPEC int orte_pls_bproc_set_node_pid(orte_cellid_t cellid, char* node_name, orte_jobid_t jobid, pid_t pid);
ORTE_DECLSPEC int orte_pls_bproc_get_node_pids(orte_jobid_t jobid, pid_t** pids, orte_std_cntr_t* num_pids);
/* utility functions for abort communications */

Просмотреть файл

@ -188,7 +188,7 @@ cleanup:
* the daemons.
*/
int orte_pls_bproc_set_node_pid(char* node_name, orte_jobid_t jobid, pid_t pid)
int orte_pls_bproc_set_node_pid(orte_cellid_t cellid, char* node_name, orte_jobid_t jobid, pid_t pid)
{
orte_gpr_value_t *values[1];
char *jobid_string, *key;
@ -202,7 +202,7 @@ int orte_pls_bproc_set_node_pid(char* node_name, orte_jobid_t jobid, pid_t pid)
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(values[0]->tokens), &(values[0]->num_tokens), node_name))) {
if (ORTE_SUCCESS != (rc = orte_schema.get_node_tokens(&(values[0]->tokens), &(values[0]->num_tokens), cellid, node_name))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(values[0]);
return rc;

Просмотреть файл

@ -220,6 +220,7 @@ static int pls_lsf_launch_job(orte_jobid_t jobid)
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end
*/
name.cellid = ORTE_PROC_MY_NAME->cellid;
name.jobid = 0;
name.vpid = map->daemon_vpid_start;
rc = orte_ns.get_proc_name_string(&name_string, &name);

Просмотреть файл

@ -214,7 +214,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
fprintf(hfp,"%s\n",node->node_name);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, 0, vpid);
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;

Просмотреть файл

@ -256,6 +256,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
/* tell the new daemons the base of the name list so they can compute
* their own name on the other end
*/
name.cellid = ORTE_PROC_MY_NAME->cellid;
name.jobid = 0;
name.vpid = map->daemon_vpid_start;
rc = orte_ns.get_proc_name_string(&name_string, &name);

Просмотреть файл

@ -240,6 +240,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
{
int i, fanout, rc;
int num_processes = 0;
orte_cellid_t cellid;
opal_list_item_t *node_item, *proc_item;
orte_job_map_t *map;
orte_vpid_t vpid_start, vpid_range;
@ -260,6 +261,9 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
return rc;
}
/* get the cellid */
cellid = orte_process_info.my_name->cellid;
/* create num_apps of pointers to Xpnodeset and Xpcommand */
node_sets = (Xpnodeset **) malloc(num_apps * sizeof(Xpnodeset *));
xcmd_sets = (Xpcommand **) malloc(num_apps * sizeof(Xpcommand *));
@ -294,7 +298,7 @@ orte_pls_xcpu_launch_job(orte_jobid_t jobid)
}
for (i = 0; i < num_apps; i++) {
rc = orte_ns_nds_xcpu_put(jobid, vpid_start,
rc = orte_ns_nds_xcpu_put(cellid, jobid, vpid_start,
num_processes, &map->apps[i]->env);
if (rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -33,7 +33,10 @@ int orte_ras_base_compare_node(orte_ras_node_t *value1, orte_ras_node_t *value2,
{
int test;
/** check node names */
if (value1->node_cellid > value2->node_cellid) return ORTE_VALUE1_GREATER;
if (value2->node_cellid > value1->node_cellid) return ORTE_VALUE2_GREATER;
/** same cell - check node names */
test = strcmp(value1->node_name, value2->node_name);
if (0 == test) return ORTE_EQUAL;
if (0 < test) return ORTE_VALUE2_GREATER;

Просмотреть файл

@ -47,6 +47,7 @@ int orte_ras_base_copy_node(orte_ras_node_t **dest, orte_ras_node_t *src, orte_d
if (NULL != src->node_name) (*dest)->node_name = strdup(src->node_name);
(*dest)->launch_id = src->launch_id;
if (NULL != src->node_arch) (*dest)->node_arch = strdup(src->node_arch);
(*dest)->node_cellid = src->node_cellid;
(*dest)->node_state = src->node_state;
(*dest)->node_slots = src->node_slots;
(*dest)->node_slots_inuse = src->node_slots_inuse;

Просмотреть файл

@ -65,6 +65,13 @@ int orte_ras_base_pack_node(orte_buffer_t *buffer, const void *src,
return rc;
}
/* pack the cellid */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer,
(void*)(&(nodes[i]->node_cellid)), 1, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the state */
if (ORTE_SUCCESS != (rc = orte_dss_pack_buffer(buffer,
(void*)(&(nodes[i]->node_state)), 1, ORTE_NODE_STATE))) {

Просмотреть файл

@ -48,8 +48,8 @@ int orte_ras_base_print_node(char **output, char *prefix, orte_ras_node_t *src,
asprintf(&pfx2, "%s", prefix);
}
asprintf(&tmp, "%sData for node: Name: %s\tLaunch id: %ld",
pfx2, src->node_name, (long)src->launch_id);
asprintf(&tmp, "%sData for node: cellid: %lu\tName: %s\tLaunch id: %ld",
pfx2, (unsigned long)src->node_cellid, src->node_name, (long)src->launch_id);
asprintf(&tmp2, "%s\n%s\tArch: %s\tState: %lu", tmp, pfx2,
src->node_arch, (unsigned long)src->node_state);

Просмотреть файл

@ -77,6 +77,14 @@ int orte_ras_base_unpack_node(orte_buffer_t *buffer, void *dest,
return rc;
}
/* unpack the cellid */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,
(&(nodes[i]->node_cellid)), &n, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the state */
n = 1;
if (ORTE_SUCCESS != (rc = orte_dss_unpack_buffer(buffer,

Просмотреть файл

@ -51,7 +51,7 @@ int orte_ras_base_proc_query_alloc_no_op(opal_list_t* list)
return ORTE_ERR_NOT_SUPPORTED;
}
orte_ras_node_t* orte_ras_base_node_lookup_no_op(const char* nodename)
orte_ras_node_t* orte_ras_base_node_lookup_no_op(orte_cellid_t cell, const char* nodename)
{
return NULL;
}

Просмотреть файл

@ -37,6 +37,7 @@ static void orte_ras_base_node_construct(orte_ras_node_t* node)
node->node_name = NULL;
node->launch_id = -1;
node->node_arch = NULL;
node->node_cellid = 0;
node->node_state = ORTE_NODE_STATE_UNKNOWN;
node->node_slots = 0;
node->node_slots_inuse = 0;
@ -107,10 +108,12 @@ int orte_ras_base_node_query(opal_list_t* nodes)
ORTE_NODE_SLOTS_ALLOC_KEY,
ORTE_NODE_SLOTS_MAX_KEY,
ORTE_NODE_USERNAME_KEY,
ORTE_CELLID_KEY,
NULL
};
orte_std_cntr_t i, cnt, *sptr;
orte_node_state_t *nsptr;
orte_cellid_t *cptr;
int32_t *i32;
orte_gpr_value_t** values;
int rc;
@ -214,6 +217,14 @@ int orte_ras_base_node_query(opal_list_t* nodes)
}
continue;
}
if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyval->value, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
continue;
}
node->node_cellid = *cptr;
continue;
}
}
opal_list_append(nodes, &node->super);
OBJ_RELEASE(value);
@ -313,6 +324,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
ORTE_NODE_SLOTS_ALLOC_KEY,
ORTE_NODE_SLOTS_MAX_KEY,
ORTE_NODE_USERNAME_KEY,
ORTE_CELLID_KEY,
NULL
};
orte_std_cntr_t i, cnt, keys_len;
@ -320,6 +332,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
char* jobid_str;
orte_std_cntr_t *sptr;
orte_node_state_t *nsptr;
orte_cellid_t *cptr;
int32_t *i32;
int rc, alloc_key_posn=5;
@ -446,7 +459,15 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
}
continue;
}
}
if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyval->value, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
continue;
}
node->node_cellid = *cptr;
continue;
}
}
/* check to see if any slots were reserved on this node for us
* The "get" command will return data from ALL nodes on the node
* segment. We ONLY want to include here nodes that are assigned
@ -473,7 +494,7 @@ int orte_ras_base_node_query_alloc(opal_list_t* nodes, orte_jobid_t jobid)
* Query the registry for a specific node
*/
orte_ras_node_t* orte_ras_base_node_lookup(const char* node_name)
orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t cellid, const char* node_name)
{
char* keys[] = {
ORTE_NODE_NAME_KEY,
@ -485,18 +506,20 @@ orte_ras_node_t* orte_ras_base_node_lookup(const char* node_name)
ORTE_NODE_SLOTS_ALLOC_KEY,
ORTE_NODE_SLOTS_MAX_KEY,
ORTE_NODE_USERNAME_KEY,
ORTE_CELLID_KEY,
NULL
};
orte_ras_node_t* node = NULL;
orte_std_cntr_t i, cnt, num_tokens;
orte_std_cntr_t *sptr;
orte_cellid_t *cptr;
orte_node_state_t *nsptr;
int32_t *i32;
orte_gpr_value_t** values;
char** tokens = NULL;
int rc;
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, (char*)node_name);
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, cellid, (char*)node_name);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return NULL;
@ -601,6 +624,14 @@ orte_ras_node_t* orte_ras_base_node_lookup(const char* node_name)
}
continue;
}
if(strcmp(keyval->key, ORTE_CELLID_KEY) == 0) {
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&cptr, keyval->value, ORTE_CELLID))) {
ORTE_ERROR_LOG(rc);
continue;
}
node->node_cellid = *cptr;
continue;
}
}
OBJ_RELEASE(values[i]);
break;
@ -627,6 +658,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
ORTE_NODE_LAUNCH_ID_KEY,
ORTE_NODE_ARCH_KEY,
ORTE_NODE_STATE_KEY,
ORTE_CELLID_KEY,
ORTE_NODE_SLOTS_KEY,
ORTE_NODE_SLOTS_IN_USE_KEY,
ORTE_NODE_SLOTS_MAX_KEY,
@ -637,6 +669,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
ORTE_INT32,
ORTE_STRING,
ORTE_NODE_STATE,
ORTE_CELLID,
ORTE_STD_CNTR,
ORTE_STD_CNTR,
ORTE_STD_CNTR,
@ -666,7 +699,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
for (i=0; i < num_values; i++) {
if (ORTE_SUCCESS != (rc = orte_gpr.create_value(&(values[i]),
ORTE_GPR_OVERWRITE | ORTE_GPR_TOKENS_AND,
ORTE_NODE_SEGMENT, 8, 0))) {
ORTE_NODE_SEGMENT, 9, 0))) {
ORTE_ERROR_LOG(rc);
for (j=0; j < i; j++) {
OBJ_RELEASE(values[j]);
@ -705,6 +738,12 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
goto cleanup;
}
++j;
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->node_cellid)))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
++j;
if (ORTE_SUCCESS != (rc = orte_gpr.create_keyval(&(values[i]->keyvals[j]), keys[j], types[j], &(node->node_slots)))) {
ORTE_ERROR_LOG(rc);
@ -730,7 +769,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes)
}
/* setup index/keys for this node */
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_name);
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -752,11 +791,11 @@ cleanup:
int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid)
int orte_ras_base_proc_insert(opal_list_t* procs, orte_cellid_t cellid, orte_jobid_t jobid)
{
opal_list_item_t* item;
orte_gpr_value_t **values;
orte_process_name_t proc_name;
orte_process_name_t *proc_name;
int rc;
orte_std_cntr_t num_values, i, j;
char *keys[] = {
@ -803,8 +842,6 @@ int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid)
return rc;
}
}
proc_name.jobid = jobid;
for(i=0, item = opal_list_get_first(procs);
i < num_values && item != opal_list_get_end(procs);
i++, item = opal_list_get_next(item)) {
@ -829,10 +866,14 @@ int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid)
}
++j;
rc = orte_ns.create_process_name(&proc_name, cellid, jobid, i);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* setup index/keys for this node */
proc_name.vpid = (orte_vpid_t)i;
rc = orte_schema.get_proc_tokens(&(values[i]->tokens), &(values[i]->num_tokens), &proc_name);
rc = orte_schema.get_proc_tokens(&(values[i]->tokens), &(values[i]->num_tokens), proc_name);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
@ -877,7 +918,7 @@ int orte_ras_base_node_delete(opal_list_t* nodes)
node = (orte_ras_node_t*)item;
/* setup index/keys for this node */
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, node->node_name);
rc = orte_schema.get_node_tokens(&tokens, &num_tokens, node->node_cellid, node->node_name);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
@ -958,7 +999,7 @@ int orte_ras_base_node_assign(opal_list_t* nodes, orte_jobid_t jobid)
continue;
/* setup index/keys for this node */
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_name);
rc = orte_schema.get_node_tokens(&(values[i]->tokens), &(values[i]->num_tokens), node->node_cellid, node->node_name);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
free(jobid_str);

Просмотреть файл

@ -71,7 +71,7 @@ int orte_ras_base_node_query_alloc_no_op(opal_list_t*, orte_jobid_t);
int orte_ras_base_proc_query_alloc_no_op(opal_list_t* procs);
orte_ras_node_t* orte_ras_base_node_lookup_no_op(const char* nodename);
orte_ras_node_t* orte_ras_base_node_lookup_no_op(orte_cellid_t, const char* nodename);
/*
* Internal support functions
@ -92,7 +92,7 @@ ORTE_DECLSPEC int orte_ras_base_node_query(opal_list_t*);
/*
* Query the registry for a specific node
*/
ORTE_DECLSPEC orte_ras_node_t* orte_ras_base_node_lookup(const char* nodename);
ORTE_DECLSPEC orte_ras_node_t* orte_ras_base_node_lookup(orte_cellid_t, const char* nodename);
/**
* Query the registry for all nodes allocated to a specific job
@ -106,7 +106,7 @@ ORTE_DECLSPEC int orte_ras_base_proc_query_alloc(opal_list_t* procs);
*/
ORTE_DECLSPEC int orte_ras_base_node_insert(opal_list_t*);
ORTE_DECLSPEC int orte_ras_base_proc_insert(opal_list_t* procs, orte_jobid_t jobid);
ORTE_DECLSPEC int orte_ras_base_proc_insert(opal_list_t* procs, orte_cellid_t cellid, orte_jobid_t jobid);
/**
* Delete the specified nodes from the registry

Просмотреть файл

@ -149,6 +149,9 @@ static int orte_ras_dash_host_allocate(orte_jobid_t jobid, opal_list_t *attribut
node->node_name = strdup(mapped_nodes[i]);
node->node_arch = NULL;
node->node_state = ORTE_NODE_STATE_UP;
/* JMS: this should not be hard-wired to 0, but there's no
other value to put it to [yet]... */
node->node_cellid = 0;
node->node_slots_inuse = 0;
node->node_slots_max = 0;
node->node_slots = 1;

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше