diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c index 1128a48290..b1c29aecd8 100644 --- a/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -59,7 +59,7 @@ #include "orte/types.h" #include "orte/util/show_help.h" -#include "orte/mca/ess/ess.h" +#include "orte/util/proc_info.h" #include "ompi/constants.h" #include "ompi/mca/btl/btl.h" @@ -284,7 +284,7 @@ static int mca_btl_tcp_component_register(void) orte_node_rank_t node_rank; char name[256]; - node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); + node_rank = orte_process_info.my_node_rank; /* Now that we've got that local rank, take the corresponding entry from the tcp_if_seq list (wrapping diff --git a/ompi/mca/mtl/mxm/mtl_mxm.c b/ompi/mca/mtl/mxm/mtl_mxm.c index 01c79a5854..6baecb8eda 100644 --- a/ompi/mca/mtl/mxm/mtl_mxm.c +++ b/ompi/mca/mtl/mxm/mtl_mxm.c @@ -148,13 +148,13 @@ int ompi_mtl_mxm_module_init(void) } MXM_VERBOSE(1, "MXM support enabled"); - if ((lr = orte_ess.get_node_rank(ORTE_PROC_MY_NAME)) == ORTE_NODE_RANK_INVALID) { + if ((lr = orte_process_info.my_node_rank) == ORTE_NODE_RANK_INVALID) { MXM_ERROR("Unable to obtain local node rank"); return OMPI_ERROR; } for (proc = 0; proc < totps; proc++) { - if(OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&procs[proc]->proc_name))) { + if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) { mxlr = max(mxlr, procs[proc]->proc_name.vpid); nlps++; } diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index b4f8204a6e..65d7d6a574 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -29,8 +29,8 @@ #include "opal/dss/dss.h" #include "opal/util/arch.h" +#include "orte/mca/db/db_types.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" #include "orte/util/proc_info.h" #include "orte/util/name_fns.h" #include "orte/util/show_help.h" @@ -140,9 +140,11 @@ int ompi_proc_complete_init(void) ompi_proc_t *proc = NULL; opal_list_item_t *item = NULL; int ret, errcode = 
OMPI_SUCCESS; - + opal_hwloc_locality_t *hwlocale; + uint32_t *ui32ptr; + OPAL_THREAD_LOCK(&ompi_proc_lock); - + for( item = opal_list_get_first(&ompi_proc_list); item != opal_list_get_end(&ompi_proc_list); item = opal_list_get_next(item)) { @@ -150,11 +152,21 @@ int ompi_proc_complete_init(void) if (proc->proc_name.vpid != ORTE_PROC_MY_NAME->vpid) { /* get the locality information */ - proc->proc_flags = orte_ess.proc_get_locality(&proc->proc_name); - /* get the name of the node it is on */ - proc->proc_hostname = orte_ess.proc_get_hostname(&proc->proc_name); - - ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void*)&(proc->proc_arch), OPAL_UINT32); + hwlocale = &(proc->proc_flags); + ret = ompi_modex_recv_key_value(ORTE_DB_LOCALITY, proc, (void**)&hwlocale, OPAL_HWLOC_LOCALITY_T); + if (OMPI_SUCCESS != ret) { + errcode = ret; + break; + } + /* get a pointer to the name of the node it is on */ + ret = ompi_modex_recv_string_pointer(ORTE_DB_HOSTNAME, proc, (void**)&(proc->proc_hostname), OPAL_STRING); + if (OMPI_SUCCESS != ret) { + errcode = ret; + break; + } + /* get the remote architecture */ + ui32ptr = &(proc->proc_arch); + ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void**)&ui32ptr, OPAL_UINT32); if (OMPI_SUCCESS == ret) { /* if arch is different than mine, create a new convertor for this proc */ if (proc->proc_arch != opal_local_arch) { @@ -352,6 +364,9 @@ int ompi_proc_refresh(void) { ompi_proc_t *proc = NULL; opal_list_item_t *item = NULL; orte_vpid_t i = 0; + int ret=OMPI_SUCCESS; + opal_hwloc_locality_t *hwlocale; + uint32_t *uiptr; OPAL_THREAD_LOCK(&ompi_proc_lock); @@ -372,8 +387,19 @@ int ompi_proc_refresh(void) { proc->proc_hostname = orte_process_info.nodename; proc->proc_arch = opal_local_arch; } else { - proc->proc_flags = orte_ess.proc_get_locality(&proc->proc_name); - proc->proc_hostname = orte_ess.proc_get_hostname(&proc->proc_name); + hwlocale = &(proc->proc_flags); + ret = ompi_modex_recv_key_value(ORTE_DB_LOCALITY, proc, 
(void**)&hwlocale, OPAL_HWLOC_LOCALITY_T); + if (OMPI_SUCCESS != ret) { + break; + } + /* get the name of the node it is on */ + ret = ompi_modex_recv_string_pointer(ORTE_DB_HOSTNAME, proc, (void*)&(proc->proc_hostname), OPAL_STRING); + if (OMPI_SUCCESS != ret) { + break; + } + /* get the remote architecture */ + uiptr = &(proc->proc_arch); + ret = ompi_modex_recv_key_value("OMPI_ARCH", proc, (void**)&uiptr, OPAL_UINT32); /* if arch is different than mine, create a new convertor for this proc */ if (proc->proc_arch != opal_local_arch) { #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT @@ -394,7 +420,7 @@ int ompi_proc_refresh(void) { OPAL_THREAD_UNLOCK(&ompi_proc_lock); - return OMPI_SUCCESS; + return ret; } int diff --git a/ompi/runtime/ompi_module_exchange.c b/ompi/runtime/ompi_module_exchange.c index 9a2a251e4d..544f116f73 100644 --- a/ompi/runtime/ompi_module_exchange.c +++ b/ompi/runtime/ompi_module_exchange.c @@ -25,7 +25,7 @@ #include "opal/mca/base/base.h" #include "opal/dss/dss.h" -#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/db/db.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -34,38 +34,78 @@ #include "ompi/runtime/ompi_module_exchange.h" -int -ompi_modex_send(mca_base_component_t * source_component, - const void *data, size_t size) +int ompi_modex_send(const mca_base_component_t *source_component, + const void *data, size_t size) { int rc; - char * name = mca_base_component_to_string(source_component); - - if(NULL == name) { + char *key; + opal_byte_object_t bo; + + key = mca_base_component_to_string(source_component); + if (NULL == key) { return OMPI_ERR_OUT_OF_RESOURCE; } - - rc = orte_grpcomm.set_proc_attr(name, data, size); - free(name); + + bo.bytes = (void*)data; + bo.size = size; + + /* the store API makes a copy of the provided data */ + rc = orte_db.store(ORTE_PROC_MY_NAME, key, &bo, OPAL_BYTE_OBJECT); + free(key); return rc; } - int -ompi_modex_recv(mca_base_component_t * component, - ompi_proc_t * proc, 
+ompi_modex_recv(const mca_base_component_t *component, + const ompi_proc_t *proc, void **buffer, - size_t * size) + size_t *size) { int rc; - char * name = mca_base_component_to_string(component); - - if(NULL == name) { + char *key; + opal_byte_object_t *boptr; + + /* set defaults */ + *buffer = NULL; + *size = 0; + + key = mca_base_component_to_string(component); + if (NULL == key) { return OMPI_ERR_OUT_OF_RESOURCE; } - rc = orte_grpcomm.get_proc_attr(&proc->proc_name, name, buffer, size); + /* the fetch API returns a pointer to the data */ + rc = orte_db.fetch(&proc->proc_name, key, (void**)&boptr, OPAL_BYTE_OBJECT); + + if (ORTE_SUCCESS == rc) { + /* xfer the data - it was allocated in the call */ + *buffer = (void*)boptr->bytes; + *size = boptr->size; + } + + free(key); + return rc; +} + +/* return a pointer to the data, but don't create a new copy of it */ +int ompi_modex_recv_pointer(const mca_base_component_t *component, + const ompi_proc_t *proc, + void **buffer, opal_data_type_t type) +{ + int rc; + char *name = mca_base_component_to_string(component); + + /* set defaults */ + *buffer = NULL; + + if (NULL == name) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* the fetch_pointer API returns a pointer to the data */ + rc = orte_db.fetch_pointer(&proc->proc_name, name, buffer, type); free(name); + return rc; } @@ -73,16 +113,57 @@ int ompi_modex_send_string(const char* key, const void *buffer, size_t size) { - return orte_grpcomm.set_proc_attr(key, buffer, size); + int rc; + opal_byte_object_t bo; + + bo.bytes = (void*)buffer; + bo.size = size; + + /* the store API makes a copy of the provided data */ + rc = orte_db.store(ORTE_PROC_MY_NAME, key, &bo, OPAL_BYTE_OBJECT); + + return rc; } int ompi_modex_recv_string(const char* key, - struct ompi_proc_t *source_proc, + const ompi_proc_t *source_proc, void **buffer, size_t *size) { - return orte_grpcomm.get_proc_attr(&source_proc->proc_name, key, buffer, size); + int rc; + opal_byte_object_t *boptr; + + /* set 
defaults */ + *buffer = NULL; + *size = 0; + + /* the fetch API returns a copy of the data */ + rc = orte_db.fetch(&source_proc->proc_name, key, (void**)&boptr, OPAL_BYTE_OBJECT); + + if (ORTE_SUCCESS == rc) { + /* xfer the data for local use */ + *buffer = boptr->bytes; + *size = boptr->size; + } + + return rc; +} + +/* return a pointer to the data, but don't create a new copy of it */ +int ompi_modex_recv_string_pointer(const char* key, + const ompi_proc_t *source_proc, + void **buffer, opal_data_type_t type) +{ + int rc; + + /* set defaults */ + *buffer = NULL; + + /* the fetch_pointer API returns a pointer to the data */ + rc = orte_db.fetch_pointer(&source_proc->proc_name, key, (void**)buffer, type); + + return rc; } int @@ -91,49 +172,21 @@ ompi_modex_send_key_value(const char* key, opal_data_type_t dtype) { int rc; - opal_buffer_t buf; - opal_byte_object_t bo; - - OBJ_CONSTRUCT(&buf, opal_buffer_t); - if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, value, 1, dtype))) { - OBJ_DESTRUCT(&buf); - return rc; - } - if (OPAL_SUCCESS != (rc = opal_dss.unload(&buf, (void**)&bo.bytes, &bo.size))) { - OBJ_DESTRUCT(&buf); - return rc; - } - OBJ_DESTRUCT(&buf); - - return orte_grpcomm.set_proc_attr(key, bo.bytes, bo.size); -} - -int -ompi_modex_recv_key_value(const char* key, - struct ompi_proc_t *source_proc, - void *value, opal_data_type_t dtype) -{ - int rc; - opal_buffer_t buf; - opal_byte_object_t bo; - int32_t n; - size_t bsize; + /* the store API makes a copy of the provided data */ + rc = orte_db.store(ORTE_PROC_MY_NAME, key, value, dtype); - bo.bytes = NULL; - bo.size = 0; - if (ORTE_SUCCESS != (rc = orte_grpcomm.get_proc_attr(&source_proc->proc_name, key, - (void**)&bo.bytes, &bsize))) { - return rc; - } - bo.size = bsize; - OBJ_CONSTRUCT(&buf, opal_buffer_t); - if (OMPI_SUCCESS != (rc = opal_dss.load(&buf, bo.bytes, bo.size))) { - OBJ_DESTRUCT(&buf); - return rc; - } - n = 1; - rc = opal_dss.unpack(&buf, value, &n, dtype); - OBJ_DESTRUCT(&buf); + return rc; +} + 
+int ompi_modex_recv_key_value(const char* key, + const ompi_proc_t *source_proc, + void **value, opal_data_type_t type) +{ + int rc; + + /* the fetch API returns the data */ + rc = orte_db.fetch(&source_proc->proc_name, key, (void**)value, type); + return rc; } diff --git a/ompi/runtime/ompi_module_exchange.h b/ompi/runtime/ompi_module_exchange.h index d8b4185cd7..04016c3ad4 100644 --- a/ompi/runtime/ompi_module_exchange.h +++ b/ompi/runtime/ompi_module_exchange.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ @@ -58,7 +58,7 @@ #include "opal/mca/mca.h" -struct ompi_proc_t; +#include "ompi/proc/proc.h" BEGIN_C_DECLS @@ -106,7 +106,7 @@ BEGIN_C_DECLS * @retval OMPI_SUCCESS On success * @retval OMPI_ERROR An unspecified error occurred */ -OMPI_DECLSPEC int ompi_modex_send(mca_base_component_t *source_component, +OMPI_DECLSPEC int ompi_modex_send(const mca_base_component_t *source_component, const void *buffer, size_t size); @@ -187,11 +187,15 @@ OMPI_DECLSPEC int ompi_modex_send_key_value(const char* key, * @retval OMPI_ERR_OUT_OF_RESOURCE No memory could be allocated for the * buffer. 
*/ -OMPI_DECLSPEC int ompi_modex_recv(mca_base_component_t *dest_component, - struct ompi_proc_t *source_proc, +OMPI_DECLSPEC int ompi_modex_recv(const mca_base_component_t *dest_component, + const ompi_proc_t *source_proc, void **buffer, size_t *size); +OMPI_DECLSPEC int ompi_modex_recv_pointer(const mca_base_component_t *component, + const ompi_proc_t *proc, + void **buffer, opal_data_type_t type); + /** * Receive a buffer from a given peer * @@ -221,9 +225,13 @@ OMPI_DECLSPEC int ompi_modex_recv(mca_base_component_t *dest_component, * buffer. */ OMPI_DECLSPEC int ompi_modex_recv_string(const char* key, - struct ompi_proc_t *source_proc, + const ompi_proc_t *source_proc, void **buffer, size_t *size); +OMPI_DECLSPEC int ompi_modex_recv_string_pointer(const char* key, + const ompi_proc_t *source_proc, + void **buffer, opal_data_type_t type); + /** * Recv a value from a given peer * @@ -243,11 +251,10 @@ OMPI_DECLSPEC int ompi_modex_recv_string(const char* key, * this build of Open MPI (systems like the Cray XT) */ OMPI_DECLSPEC int ompi_modex_recv_key_value(const char* key, - struct ompi_proc_t *source_proc, - void *value, + const ompi_proc_t *source_proc, + void **value, opal_data_type_t dtype); - END_C_DECLS #endif /* MCA_OMPI_MODULE_EXCHANGE_H */ diff --git a/opal/dss/dss_compare.c b/opal/dss/dss_compare.c index ebb3f2f466..3a9000c108 100644 --- a/opal/dss/dss_compare.c +++ b/opal/dss/dss_compare.c @@ -254,3 +254,9 @@ int opal_dss_compare_node_stat(opal_node_stats_t *value1, opal_node_stats_t *val { return OPAL_EQUAL; /* eventually compare field to field */ } + +/* OPAL_VALUE */ +int opal_dss_compare_value(opal_value_t *value1, opal_value_t *value2, opal_data_type_t type) +{ + return OPAL_EQUAL; /* eventually compare field to field */ +} diff --git a/opal/dss/dss_copy.c b/opal/dss/dss_copy.c index b7506c4968..3fa8a00be1 100644 --- a/opal/dss/dss_copy.c +++ b/opal/dss/dss_copy.c @@ -226,3 +226,80 @@ int opal_dss_copy_node_stat(opal_node_stats_t **dest, 
opal_node_stats_t *src, p->sample_time.tv_usec = src->sample_time.tv_usec; return OPAL_SUCCESS; } + +/* OPAL_VALUE */ +int opal_dss_copy_value(opal_value_t **dest, opal_value_t *src, + opal_data_type_t type) +{ + opal_value_t *p; + + /* create the new object */ + *dest = OBJ_NEW(opal_value_t); + if (NULL == *dest) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + p = *dest; + + /* copy the type and key */ + if (NULL != src->key) { + p->key = strdup(src->key); + } + p->type = src->type; + + /* copy the right field */ + switch (src->type) { + case OPAL_BYTE: + p->data.byte = src->data.byte; + break; + case OPAL_STRING: + if (NULL != src->data.string) { + p->data.string = strdup(src->data.string); + } else { + p->data.string = NULL; + } + break; + case OPAL_PID: + p->data.pid = src->data.pid; + break; + case OPAL_INT: + p->data.integer = src->data.integer; + break; + case OPAL_INT8: + p->data.int8 = src->data.int8; + break; + case OPAL_INT16: + p->data.int16 = src->data.int16; + break; + case OPAL_INT32: + p->data.int32 = src->data.int32; + break; + case OPAL_INT64: + p->data.int64 = src->data.int64; + break; + case OPAL_UINT: + p->data.uint = src->data.uint; + break; + case OPAL_UINT8: + p->data.uint8 = src->data.uint8; + break; + case OPAL_UINT16: + p->data.uint16 = src->data.uint16; + break; + case OPAL_UINT32: + p->data.uint32 = src->data.uint32; + break; + case OPAL_UINT64: + p->data.uint64 = src->data.uint64; + break; + case OPAL_BYTE_OBJECT: + p->data.bo.bytes = malloc(src->data.bo.size); + memcpy(p->data.bo.bytes, src->data.bo.bytes, src->data.bo.size); + p->data.bo.size = src->data.bo.size; + break; + default: + opal_output(0, "COPY-OPAL-VALUE: UNSUPPORTED TYPE %d", (int)src->type); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} diff --git a/opal/dss/dss_internal.h b/opal/dss/dss_internal.h index 6a8353b81c..a18466d484 100644 --- a/opal/dss/dss_internal.h +++ b/opal/dss/dss_internal.h @@ -298,6 +298,9 @@ int opal_dss_pack_pstat(opal_buffer_t *buffer, const 
void *src, int opal_dss_pack_node_stat(opal_buffer_t *buffer, const void *src, int32_t num_vals, opal_data_type_t type); +int opal_dss_pack_value(opal_buffer_t *buffer, const void *src, + int32_t num_vals, opal_data_type_t type); + /* * Internal unpack functions */ @@ -340,6 +343,9 @@ int opal_dss_unpack_pstat(opal_buffer_t *buffer, void *dest, int opal_dss_unpack_node_stat(opal_buffer_t *buffer, void *dest, int32_t *num_vals, opal_data_type_t type); +int opal_dss_unpack_value(opal_buffer_t *buffer, void *dest, + int32_t *num_vals, opal_data_type_t type); + /* * Internal copy functions */ @@ -359,6 +365,9 @@ int opal_dss_copy_pstat(opal_pstats_t **dest, opal_pstats_t *src, int opal_dss_copy_node_stat(opal_node_stats_t **dest, opal_node_stats_t *src, opal_data_type_t type); +int opal_dss_copy_value(opal_value_t **dest, opal_value_t *src, + opal_data_type_t type); + /* * Internal compare functions */ @@ -398,6 +407,8 @@ int opal_dss_compare_pstat(opal_pstats_t *value1, opal_pstats_t *value2, opal_da int opal_dss_compare_node_stat(opal_node_stats_t *value1, opal_node_stats_t *value2, opal_data_type_t type); +int opal_dss_compare_value(opal_value_t *value1, opal_value_t *value2, opal_data_type_t type); + /* * Internal print functions */ @@ -428,6 +439,7 @@ int opal_dss_print_data_type(char **output, char *prefix, opal_data_type_t *src, int opal_dss_print_byte_object(char **output, char *prefix, opal_byte_object_t *src, opal_data_type_t type); int opal_dss_print_pstat(char **output, char *prefix, opal_pstats_t *src, opal_data_type_t type); int opal_dss_print_node_stat(char **output, char *prefix, opal_node_stats_t *src, opal_data_type_t type); +int opal_dss_print_value(char **output, char *prefix, opal_value_t *src, opal_data_type_t type); /* diff --git a/opal/dss/dss_open_close.c b/opal/dss/dss_open_close.c index da9e4c1132..d38e34e39b 100644 --- a/opal/dss/dss_open_close.c +++ b/opal/dss/dss_open_close.c @@ -425,6 +425,16 @@ int opal_dss_open(void) "OPAL_NODE_STAT", 
&tmp))) { return rc; } + tmp = OPAL_VALUE; + if (OPAL_SUCCESS != (rc = opal_dss.register_type(opal_dss_pack_value, + opal_dss_unpack_value, + (opal_dss_copy_fn_t)opal_dss_copy_value, + (opal_dss_compare_fn_t)opal_dss_compare_value, + (opal_dss_print_fn_t)opal_dss_print_value, + OPAL_DSS_STRUCTURED, + "OPAL_VALUE", &tmp))) { + return rc; + } /* All done */ opal_dss_initialized = true; diff --git a/opal/dss/dss_pack.c b/opal/dss/dss_pack.c index d2ce537e84..027434c138 100644 --- a/opal/dss/dss_pack.c +++ b/opal/dss/dss_pack.c @@ -330,7 +330,7 @@ int opal_dss_pack_string(opal_buffer_t *buffer, const void *src, * OPAL_DATA_TYPE */ int opal_dss_pack_data_type(opal_buffer_t *buffer, const void *src, int32_t num_vals, - opal_data_type_t type) + opal_data_type_t type) { int ret; @@ -513,3 +513,111 @@ int opal_dss_pack_node_stat(opal_buffer_t *buffer, const void *src, return OPAL_SUCCESS; } + +/* + * OPAL_VALUE + */ +int opal_dss_pack_value(opal_buffer_t *buffer, const void *src, + int32_t num_vals, opal_data_type_t type) +{ + opal_value_t **ptr; + int32_t i, n; + int ret; + + ptr = (opal_value_t **) src; + + for (i = 0; i < num_vals; ++i) { + /* pack the key and type */ + if (OPAL_SUCCESS != (ret = opal_dss_pack_string(buffer, &ptr[i]->key, 1, OPAL_STRING))) { + return ret; + } + if (OPAL_SUCCESS != (ret = opal_dss_pack_data_type(buffer, &ptr[i]->type, 1, OPAL_DATA_TYPE))) { + return ret; + } + /* now pack the right field */ + switch (ptr[i]->type) { + case OPAL_BYTE: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.byte, 1, OPAL_BYTE))) { + return ret; + } + break; + case OPAL_STRING: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.string, 1, OPAL_STRING))) { + return ret; + } + break; + case OPAL_PID: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.pid, 1, OPAL_PID))) { + return ret; + } + break; + case OPAL_INT: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.integer, 1, 
OPAL_INT))) { + return ret; + } + break; + case OPAL_INT8: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.int8, 1, OPAL_INT8))) { + return ret; + } + break; + case OPAL_INT16: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.int16, 1, OPAL_INT16))) { + return ret; + } + break; + case OPAL_INT32: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.int32, 1, OPAL_INT32))) { + return ret; + } + break; + case OPAL_INT64: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.int64, 1, OPAL_INT64))) { + return ret; + } + break; + case OPAL_UINT: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.uint, 1, OPAL_UINT))) { + return ret; + } + break; + case OPAL_UINT8: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.uint8, 1, OPAL_UINT8))) { + return ret; + } + break; + case OPAL_UINT16: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.uint16, 1, OPAL_UINT16))) { + return ret; + } + break; + case OPAL_UINT32: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.uint32, 1, OPAL_UINT32))) { + return ret; + } + break; + case OPAL_UINT64: + if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->data.uint64, 1, OPAL_UINT64))) { + return ret; + } + break; + case OPAL_BYTE_OBJECT: + /* have to pack by hand so we can match unpack without allocation */ + n = ptr[i]->data.bo.size; + if (OPAL_SUCCESS != (ret = opal_dss_pack_int32(buffer, &n, 1, OPAL_INT32))) { + return ret; + } + if (0 < n) { + if (OPAL_SUCCESS != (ret = opal_dss_pack_byte(buffer, ptr[i]->data.bo.bytes, n, OPAL_BYTE))) { + return ret; + } + } + break; + default: + opal_output(0, "PACK-OPAL-VALUE: UNSUPPORTED TYPE %d", (int)ptr[i]->type); + return OPAL_ERROR; + } + } + + return OPAL_SUCCESS; +} diff --git a/opal/dss/dss_print.c b/opal/dss/dss_print.c index bf92433087..8e04bb38b9 100644 --- a/opal/dss/dss_print.c +++ b/opal/dss/dss_print.c @@ 
-470,3 +470,11 @@ int opal_dss_print_node_stat(char **output, char *prefix, opal_node_stats_t *src return OPAL_SUCCESS; } + +/* + * OPAL_VALUE + */ +int opal_dss_print_value(char **output, char *prefix, opal_value_t *src, opal_data_type_t type) +{ + return OPAL_SUCCESS; +} diff --git a/opal/dss/dss_types.h b/opal/dss/dss_types.h index 8ecb1f4e08..8f3a63c462 100644 --- a/opal/dss/dss_types.h +++ b/opal/dss/dss_types.h @@ -76,6 +76,7 @@ typedef struct { #define OPAL_PSTAT (opal_data_type_t) 19 /**< process statistics */ #define OPAL_NODE_STAT (opal_data_type_t) 20 /**< node statistics */ #define OPAL_HWLOC_TOPO (opal_data_type_t) 21 /**< hwloc topology */ +#define OPAL_VALUE (opal_data_type_t) 22 #define OPAL_DSS_ID_DYNAMIC (opal_data_type_t) 30 diff --git a/opal/dss/dss_unpack.c b/opal/dss/dss_unpack.c index da5ba2503a..0ab7a0f09f 100644 --- a/opal/dss/dss_unpack.c +++ b/opal/dss/dss_unpack.c @@ -650,3 +650,124 @@ int opal_dss_unpack_node_stat(opal_buffer_t *buffer, void *dest, return OPAL_SUCCESS; } + +/* + * OPAL_VALUE + */ +int opal_dss_unpack_value(opal_buffer_t *buffer, void *dest, + int32_t *num_vals, opal_data_type_t type) +{ + opal_value_t **ptr; + int32_t i, n, m; + int ret; + + ptr = (opal_value_t **) dest; + n = *num_vals; + + for (i = 0; i < n; ++i) { + /* allocate the new object */ + ptr[i] = OBJ_NEW(opal_value_t); + if (NULL == ptr[i]) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + /* unpack the key and type */ + m=1; + if (OPAL_SUCCESS != (ret = opal_dss_unpack_string(buffer, &ptr[i]->key, &m, OPAL_STRING))) { + return ret; + } + m=1; + if (OPAL_SUCCESS != (ret = opal_dss_unpack_data_type(buffer, &ptr[i]->type, &m, OPAL_DATA_TYPE))) { + return ret; + } + /* now unpack the right field */ + m=1; + switch (ptr[i]->type) { + case OPAL_BYTE: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.byte, &m, OPAL_BYTE))) { + return ret; + } + break; + case OPAL_STRING: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, 
&ptr[i]->data.string, &m, OPAL_STRING))) { + return ret; + } + break; + case OPAL_PID: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.pid, &m, OPAL_PID))) { + return ret; + } + break; + case OPAL_INT: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.integer, &m, OPAL_INT))) { + return ret; + } + break; + case OPAL_INT8: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.int8, &m, OPAL_INT8))) { + return ret; + } + break; + case OPAL_INT16: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.int16, &m, OPAL_INT16))) { + return ret; + } + break; + case OPAL_INT32: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.int32, &m, OPAL_INT32))) { + return ret; + } + break; + case OPAL_INT64: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.int64, &m, OPAL_INT64))) { + return ret; + } + break; + case OPAL_UINT: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.uint, &m, OPAL_UINT))) { + return ret; + } + break; + case OPAL_UINT8: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.uint8, &m, OPAL_UINT8))) { + return ret; + } + break; + case OPAL_UINT16: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.uint16, &m, OPAL_UINT16))) { + return ret; + } + break; + case OPAL_UINT32: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.uint32, &m, OPAL_UINT32))) { + return ret; + } + break; + case OPAL_UINT64: + if (OPAL_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, &ptr[i]->data.uint64, &m, OPAL_UINT64))) { + return ret; + } + break; + case OPAL_BYTE_OBJECT: + /* cannot use byte object unpack as it allocates memory, so unpack object size in bytes */ + if (OPAL_SUCCESS != (ret = opal_dss_unpack_int32(buffer, &(ptr[i]->data.bo.size), &m, OPAL_INT32))) { + return ret; + } + if (0 < ptr[i]->data.bo.size) { + ptr[i]->data.bo.bytes = 
(uint8_t*)malloc(ptr[i]->data.bo.size); + if (NULL == ptr[i]->data.bo.bytes) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + if (OPAL_SUCCESS != (ret = opal_dss_unpack_byte(buffer, ptr[i]->data.bo.bytes, + &(ptr[i]->data.bo.size), OPAL_BYTE))) { + return ret; + } + } + break; + default: + opal_output(0, "UNPACK-OPAL-VALUE: UNSUPPORTED TYPE"); + return OPAL_ERROR; + } + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/hwloc/hwloc.h b/opal/mca/hwloc/hwloc.h index 016d826ab9..50a2ee65a1 100644 --- a/opal/mca/hwloc/hwloc.h +++ b/opal/mca/hwloc/hwloc.h @@ -171,6 +171,7 @@ typedef uint16_t opal_binding_policy_t; /* ******************************************************************** */ typedef uint16_t opal_hwloc_locality_t; +#define OPAL_HWLOC_LOCALITY_T OPAL_UINT16 /** Process locality definitions */ enum { diff --git a/orte/mca/db/base/Makefile.am b/orte/mca/db/base/Makefile.am index 1fb7d7495c..b05b485bdc 100644 --- a/orte/mca/db/base/Makefile.am +++ b/orte/mca/db/base/Makefile.am @@ -1,5 +1,6 @@ # # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow diff --git a/orte/mca/db/base/base.h b/orte/mca/db/base/base.h index a557922fbb..9942c6dcac 100644 --- a/orte/mca/db/base/base.h +++ b/orte/mca/db/base/base.h @@ -18,7 +18,7 @@ #include "opal/mca/mca.h" #include "opal/class/opal_list.h" -#include "opal/mca/event/event.h" +#include "opal/dss/dss.h" #include "orte/mca/db/db.h" @@ -42,19 +42,16 @@ ORTE_DECLSPEC int orte_db_base_close(void); typedef struct { int output; opal_list_t available_components; - struct timeval timeout; } orte_db_base_t; ORTE_DECLSPEC extern orte_db_base_t orte_db_base; -typedef struct { - opal_list_item_t *super; - orte_process_name_t name; - char *key; - opal_event_t *ev; - orte_db_fetch_callback_fn_t cbfunc; - void *cbdata; -} orte_db_fetch_req_t; -OBJ_CLASS_DECLARATION(orte_db_fetch_req_t); +ORTE_DECLSPEC int orte_db_base_send_modex_string(const char* key, + const void *buffer, + size_t size); + +ORTE_DECLSPEC int orte_db_base_send_modex_key_value(const char* key, + const void *value, + opal_data_type_t dtype); END_C_DECLS diff --git a/orte/mca/db/base/db_base_close.c b/orte/mca/db/base/db_base_close.c index c571ee9c68..719e397fab 100644 --- a/orte/mca/db/base/db_base_close.c +++ b/orte/mca/db/base/db_base_close.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow diff --git a/orte/mca/db/base/db_base_open.c b/orte/mca/db/base/db_base_open.c index 496f00bbf3..ac3ca48bc3 100644 --- a/orte/mca/db/base/db_base_open.c +++ b/orte/mca/db/base/db_base_open.c @@ -47,44 +47,3 @@ int orte_db_base_open(void) return ORTE_SUCCESS; } - -static void fetch_construct(orte_db_fetch_req_t *fetch) -{ - fetch->key = NULL; - fetch->ev = opal_event_alloc(); -} -static void fetch_destruct(orte_db_fetch_req_t *fetch) -{ - if (NULL != fetch->key) { - free(fetch->key); - } - if (NULL != fetch->ev) { - opal_event_free(fetch->ev); - } -} -OBJ_CLASS_INSTANCE(orte_db_fetch_req_t, - opal_list_item_t, - fetch_construct, - fetch_destruct); - -static void keyval_construct(orte_db_keyval_t *ptr) -{ - ptr->key = NULL; - ptr->value.bytes = NULL; - ptr->value.size = 0; -} - -static void keyval_destruct(orte_db_keyval_t *ptr) -{ - if (NULL != ptr->key) { - free(ptr->key); - } - if (NULL != ptr->value.bytes) { - free(ptr->value.bytes); - } -} -OBJ_CLASS_INSTANCE(orte_db_keyval_t, - opal_list_item_t, - keyval_construct, - keyval_destruct); - diff --git a/orte/mca/db/db.h b/orte/mca/db/db.h index cadec1c2b2..93460a91f2 100644 --- a/orte/mca/db/db.h +++ b/orte/mca/db/db.h @@ -22,6 +22,8 @@ #include "opal/mca/mca.h" #include "opal/dss/dss_types.h" +#include "orte/mca/db/db_types.h" + /** * DATABASE DESIGN * @@ -33,28 +35,6 @@ BEGIN_C_DECLS -/** - * Container for data for a particular key-value pair - */ -typedef struct orte_db_keyval_t { - /** Structure can be put on lists */ - opal_list_item_t super; - /** Key */ - char *key; - /** Byte object containing binary blob of data associated with this proc,key pair */ - opal_byte_object_t value; -} orte_db_keyval_t; -OBJ_CLASS_DECLARATION(orte_db_keyval_t); - -/* define the callback function for returning data - note that - * the memory backing the data belongs to the DB framework. 
The - * receiver must NOT release it - */ -typedef void (*orte_db_fetch_callback_fn_t)(orte_process_name_t *src, - char *key, - orte_db_keyval_t *data, - int num_entries); - /* * Initialize the module */ @@ -66,30 +46,52 @@ typedef int (*orte_db_base_module_init_fn_t)(void); typedef void (*orte_db_base_module_finalize_fn_t)(void); /* - * Store data in the database - overwrites if already present. The data is + * Store a copy of data in the database - overwrites if already present. The data is * copied into the database and therefore does not need to be preserved by - * the caller. Note that this is a non-blocking call - if data is stored - * offsite, the transfer will occur in the background. + * the caller. */ typedef int (*orte_db_base_module_store_fn_t)(const orte_process_name_t *proc, - const char *key, - const void *object, int32_t size); + const char *key, const void *data, opal_data_type_t type); + +/* + * Store a pointer to data in the database - data must be retained by the user. + * This allows users to share data across the code base without consuming + * additional memory, but while retaining local access + */ +typedef int (*orte_db_base_module_store_pointer_fn_t)(const orte_process_name_t *proc, + opal_value_t *kv); /* * Retrieve data * - * Retrieve the data for the given proc associated with the specified key. Wildcards - * are supported here as well. This is a non-blocking - * call - data will be returned via the callback function ONCE IT BECOMES AVAILABLE. Use - * of the "timeout" MCA parameter is encouraged to avoid hanging on fetch requests for - * "blocking" data that can never be resolved. - * - * NOTE: INTERIM IMPLEMENTATION WILL SIMPLY LOOKUP EXISTING DATA, RETURNING AN ERROR IF - * NOT ALREADY PRESENT. + * Retrieve data for the given proc associated with the specified key. Wildcards + * are supported here as well. Caller is responsible for releasing any returned + * object. 
*/ typedef int (*orte_db_base_module_fetch_fn_t)(const orte_process_name_t *proc, const char *key, - opal_list_t *values); + void **data, opal_data_type_t type); + +/* + * Retrieve a pointer to data + * + * Retrieve a pointer to the data for the given proc associated with the specified key. Wildcards + * are supported here as well. Callers are cautioned against modifying the data as this + * will directly alter information in the database! A local copy of the data should be made + * wherever modification is possible. + */ +typedef int (*orte_db_base_module_fetch_pointer_fn_t)(const orte_process_name_t *proc, + const char *key, + void **data, opal_data_type_t type); +/* + * Retrieve multiple data elements + * + * Retrieve data for the given proc associated with the specified key. Wildcards + * are supported here as well. Caller is responsible for releasing the objects on the list. + */ +typedef int (*orte_db_base_module_fetch_multiple_fn_t)(const orte_process_name_t *proc, + const char *key, + opal_list_t *kvs); /* * Delete data @@ -100,8 +102,6 @@ typedef int (*orte_db_base_module_fetch_fn_t)(const orte_process_name_t *proc, * This function also supports wildcard values in the proc field. A NULL proc indicates * that ALL data in the database is to be purged. A WILDCARD vpid will delete all matching * keys from that jobid. Etc. - * - * Note that this is a non-blocking call - data stored off-site will be deleted asynchronously. 
*/ typedef int (*orte_db_base_module_remove_fn_t)(const orte_process_name_t *proc, const char *key); @@ -109,11 +109,14 @@ typedef int (*orte_db_base_module_remove_fn_t)(const orte_process_name_t *proc, * the standard module data structure */ struct orte_db_base_module_1_0_0_t { - orte_db_base_module_init_fn_t init; - orte_db_base_module_finalize_fn_t finalize; - orte_db_base_module_store_fn_t store; - orte_db_base_module_fetch_fn_t fetch; - orte_db_base_module_remove_fn_t remove; + orte_db_base_module_init_fn_t init; + orte_db_base_module_finalize_fn_t finalize; + orte_db_base_module_store_fn_t store; + orte_db_base_module_store_pointer_fn_t store_pointer; + orte_db_base_module_fetch_fn_t fetch; + orte_db_base_module_fetch_pointer_fn_t fetch_pointer; + orte_db_base_module_fetch_multiple_fn_t fetch_multiple; + orte_db_base_module_remove_fn_t remove; }; typedef struct orte_db_base_module_1_0_0_t orte_db_base_module_1_0_0_t; typedef struct orte_db_base_module_1_0_0_t orte_db_base_module_t; diff --git a/orte/mca/db/db_types.h b/orte/mca/db/db_types.h new file mode 100644 index 0000000000..d45735002c --- /dev/null +++ b/orte/mca/db/db_types.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + * The OpenRTE Database Framework + * + */ + +#ifndef ORTE_DB_TYPES_H +#define ORTE_DB_TYPES_H + +#include "orte_config.h" +#include "orte/types.h" + +#include "opal/mca/mca.h" +#include "opal/dss/dss_types.h" + +BEGIN_C_DECLS + +/* define some common keys used in ORTE */ +#define ORTE_DB_HOSTNAME "orte.hostname" +#define ORTE_DB_DAEMON_VPID "orte.daemon.vpid" +#define ORTE_DB_NODERANK "orte.node.rank" +#define ORTE_DB_LOCALRANK "orte.local.rank" +#define ORTE_DB_BIND_LEVEL "orte.bind.level" +#define ORTE_DB_BIND_INDEX "orte.bind.index" +#define ORTE_DB_LOCALITY "orte.locality" +#define ORTE_DB_ARCH "orte.arch" +#define ORTE_DB_NPROCS "orte.nprocs" +#define ORTE_DB_RMLURI "orte.rmluri" + +END_C_DECLS + +#endif diff --git a/orte/mca/db/hash/db_hash.c b/orte/mca/db/hash/db_hash.c index 3b61a2e165..5694427b44 100644 --- a/orte/mca/db/hash/db_hash.c +++ b/orte/mca/db/hash/db_hash.c @@ -35,17 +35,27 @@ static int init(void); static void finalize(void); static int store(const orte_process_name_t *proc, - const char *key, const void *object, int32_t size); + const char *key, const void *object, opal_data_type_t type); +static int store_pointer(const orte_process_name_t *proc, + opal_value_t *kv); static int fetch(const orte_process_name_t *proc, - const char *key, - opal_list_t *values); + const char *key, void **data, opal_data_type_t type); +static int fetch_pointer(const orte_process_name_t *proc, + const char *key, + void **data, opal_data_type_t type); +static int fetch_multiple(const orte_process_name_t *proc, + const char *key, + opal_list_t *kvs); static int remove_data(const orte_process_name_t *proc, const char *key); orte_db_base_module_t orte_db_hash_module = { init, finalize, store, + store_pointer, fetch, + fetch_pointer, + fetch_multiple, remove_data }; @@ -135,13 +145,13 @@ static void finalize(void) * Find data for a given key in a given proc_data_t * 
container. */ -static orte_db_keyval_t* lookup_keyval(proc_data_t *proc_data, +static opal_value_t* lookup_keyval(proc_data_t *proc_data, const char *key) { - orte_db_keyval_t *kv = NULL; - for (kv = (orte_db_keyval_t *) opal_list_get_first(&proc_data->data); - kv != (orte_db_keyval_t *) opal_list_get_end(&proc_data->data); - kv = (orte_db_keyval_t *) opal_list_get_next(kv)) { + opal_value_t *kv = NULL; + for (kv = (opal_value_t *) opal_list_get_first(&proc_data->data); + kv != (opal_value_t *) opal_list_get_end(&proc_data->data); + kv = (opal_value_t *) opal_list_get_next(kv)) { if (0 == strcmp(key, kv->key)) { return kv; } @@ -174,17 +184,18 @@ static proc_data_t* lookup_orte_proc(opal_hash_table_t *jtable, orte_vpid_t vpid } static int store(const orte_process_name_t *proc, - const char *key, const void *object, int32_t size) + const char *key, const void *data, opal_data_type_t type) { int i; job_data_t *jtable, *jtab; proc_data_t *proc_data; - orte_db_keyval_t *kv; + opal_value_t *kv; + opal_byte_object_t *boptr; OPAL_OUTPUT_VERBOSE((5, orte_db_base.output, - "%s db:hash:store: storing key %s data size %lu for proc %s", + "%s db:hash:store: storing key %s data type %d for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - key, (unsigned long)size, ORTE_NAME_PRINT(proc))); + key, (int)type, ORTE_NAME_PRINT(proc))); /* get the job data object for this proc */ jtable = NULL; @@ -214,36 +225,321 @@ static int store(const orte_process_name_t *proc, * a pre-existing value */ if (NULL != (kv = lookup_keyval(proc_data, key))) { - /* release the old data */ - if (NULL != kv->value.bytes) { - free(kv->value.bytes); - } - } else { - kv = OBJ_NEW(orte_db_keyval_t); - kv->key = strdup(key); - opal_list_append(&proc_data->data, &kv->super); + opal_list_remove_item(&proc_data->data, &kv->super); + OBJ_RELEASE(kv); + } + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(key); + opal_list_append(&proc_data->data, &kv->super); + + /* the type could come in as an ORTE one (e.g., 
ORTE_VPID). Since + * the value is an OPAL definition, it cannot cover ORTE data + * types, so convert to the underlying OPAL type + */ + switch (type) { + case OPAL_STRING: + kv->type = OPAL_STRING; + kv->data.string = strdup(data); + break; + case ORTE_VPID: + case OPAL_UINT32: + kv->type = OPAL_UINT32; + kv->data.uint32 = *(uint32_t*)data; + break; + case OPAL_UINT16: + kv->type = OPAL_UINT16; + kv->data.uint16 = *(uint16_t*)(data); + break; + case OPAL_INT: + kv->type = OPAL_INT; + kv->data.integer = *(int*)(data); + break; + case OPAL_UINT: + kv->type = OPAL_UINT; + kv->data.uint = *(unsigned int*)(data); + break; + case OPAL_BYTE_OBJECT: + kv->type = OPAL_BYTE_OBJECT; + boptr = (opal_byte_object_t*)data; + kv->data.bo.bytes = malloc(boptr->size); + memcpy(kv->data.bo.bytes, boptr->bytes, boptr->size); + kv->data.bo.size = boptr->size; + break; + default: + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + return ORTE_ERR_NOT_SUPPORTED; } - kv->value.bytes = (uint8_t*)malloc(size); - memcpy(kv->value.bytes, object, size); - kv->value.size = size; return ORTE_SUCCESS; } -static int fetch(const orte_process_name_t *proc, - const char *key, - opal_list_t *values) +static int store_pointer(const orte_process_name_t *proc, + opal_value_t *kv) { int i; job_data_t *jtable, *jtab; proc_data_t *proc_data; - orte_db_keyval_t *kv, *ans; + opal_value_t *k2; + + OPAL_OUTPUT_VERBOSE((5, orte_db_base.output, + "%s db:hash:store: storing pointer of key %s for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + kv->key, ORTE_NAME_PRINT(proc))); + + /* get the job data object for this proc */ + jtable = NULL; + for (i=0; i < job_data.size; i++) { + if (NULL == (jtab = (job_data_t*)opal_pointer_array_get_item(&job_data, i))) { + continue; + } + if (jtab->jobid == proc->jobid) { + jtable = jtab; + break; + } + } + if (NULL == jtable) { + /* need to add an entry for this job */ + jtable = OBJ_NEW(job_data_t); + jtable->jobid = proc->jobid; + opal_pointer_array_add(&job_data, jtable); + } + 
+ /* lookup the proc data object for this proc */ + if (NULL == (proc_data = lookup_orte_proc(jtable->data, proc->vpid))) { + /* unrecoverable error */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* see if we already have this key in the data - means we are updating + * a pre-existing value + */ + if (NULL != (k2 = lookup_keyval(proc_data, kv->key))) { + opal_list_remove_item(&proc_data->data, &k2->super); + OBJ_RELEASE(k2); + } + opal_list_append(&proc_data->data, &kv->super); + return ORTE_SUCCESS; +} + +static int fetch(const orte_process_name_t *proc, + const char *key, void **data, opal_data_type_t type) +{ + int i; + job_data_t *jtable, *jtab; + proc_data_t *proc_data; + opal_value_t *kv; + opal_byte_object_t *boptr; OPAL_OUTPUT_VERBOSE((5, orte_db_base.output, "%s db:hash:fetch: searching for key %s on proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == key) ? "NULL" : key, ORTE_NAME_PRINT(proc))); + /* if the key is NULL, that is an error */ + if (NULL == key) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + /* get the job data object for this proc */ + jtable = NULL; + for (i=0; i < job_data.size; i++) { + if (NULL == (jtab = (job_data_t*)opal_pointer_array_get_item(&job_data, i))) { + continue; + } + if (jtab->jobid == proc->jobid) { + jtable = jtab; + break; + } + } + if (NULL == jtable) { + /* eventually, we will fetch this data - but for now, this + * is simply an error + */ + return ORTE_ERR_NOT_FOUND; + } + + /* lookup the proc data object for this proc */ + if (NULL == (proc_data = lookup_orte_proc(jtable->data, proc->vpid))) { + /* unrecoverable error */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* find the value */ + if (NULL == (kv = lookup_keyval(proc_data, key))) { + /* again, we eventually will attempt to fetch the data - for + * now, just report it as an error */ + return ORTE_ERR_NOT_FOUND; + } + + /* do the copy and check the type */ + switch (type) { + case OPAL_STRING: + if (OPAL_STRING != kv->type) { + 
return ORTE_ERR_TYPE_MISMATCH; + } + *data = strdup(kv->data.string); + break; + case ORTE_VPID: + case OPAL_UINT32: + if (OPAL_UINT32 != kv->type && + ORTE_VPID != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + memcpy(*data, &kv->data.uint32, 4); + break; + case OPAL_UINT16: + if (OPAL_UINT16 != kv->type && + ORTE_NODE_RANK != kv->type && + ORTE_LOCAL_RANK != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + memcpy(*data, &kv->data.uint16, 2); + break; + case OPAL_INT: + if (OPAL_INT != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + memcpy(*data, &kv->data.integer, sizeof(int)); + break; + case OPAL_UINT: + if (OPAL_UINT != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + memcpy(*data, &kv->data.uint, sizeof(unsigned int)); + break; + case OPAL_BYTE_OBJECT: + if (OPAL_BYTE_OBJECT != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + boptr = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); + boptr->bytes = malloc(kv->data.bo.size); + memcpy(boptr->bytes, kv->data.bo.bytes, kv->data.bo.size); + boptr->size = kv->data.bo.size; + *data = boptr; + break; + default: + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + return ORTE_ERR_NOT_SUPPORTED; + } + + return ORTE_SUCCESS; +} + +static int fetch_pointer(const orte_process_name_t *proc, + const char *key, + void **data, opal_data_type_t type) +{ + int i; + job_data_t *jtable, *jtab; + proc_data_t *proc_data; + opal_value_t *kv; + + OPAL_OUTPUT_VERBOSE((5, orte_db_base.output, + "%s db:hash:fetch_pointer: searching for key %s on proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == key) ? 
"NULL" : key, ORTE_NAME_PRINT(proc))); + + /* if the key is NULL, that is an error */ + if (NULL == key) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + + /* get the job data object for this proc */ + jtable = NULL; + for (i=0; i < job_data.size; i++) { + if (NULL == (jtab = (job_data_t*)opal_pointer_array_get_item(&job_data, i))) { + continue; + } + if (jtab->jobid == proc->jobid) { + jtable = jtab; + break; + } + } + if (NULL == jtable) { + /* eventually, we will fetch this data - but for now, this + * is simply an error + */ + return ORTE_ERR_NOT_FOUND; + } + + /* lookup the proc data object for this proc */ + if (NULL == (proc_data = lookup_orte_proc(jtable->data, proc->vpid))) { + /* unrecoverable error */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* find the value */ + if (NULL == (kv = lookup_keyval(proc_data, key))) { + /* again, we eventually will attempt to fetch the data - for + * now, just report it as an error */ + return ORTE_ERR_NOT_FOUND; + } + + switch (type) { + case OPAL_STRING: + if (OPAL_STRING != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + *data = kv->data.string; + break; + case ORTE_VPID: + case OPAL_UINT32: + if (OPAL_UINT32 != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + *data = &kv->data.uint32; + break; + case OPAL_UINT16: + if (OPAL_UINT16 != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + *data = &kv->data.uint16; + break; + case OPAL_INT: + if (OPAL_INT != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + *data = &kv->data.integer; + break; + case OPAL_UINT: + if (OPAL_UINT != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + *data = &kv->data.uint; + break; + case OPAL_BYTE_OBJECT: + if (OPAL_BYTE_OBJECT != kv->type) { + return ORTE_ERR_TYPE_MISMATCH; + } + *data = &kv->data.bo; + break; + default: + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + return ORTE_ERR_NOT_SUPPORTED; + } + + return ORTE_SUCCESS; +} + +static int fetch_multiple(const orte_process_name_t *proc, + const char *key, + 
opal_list_t *kvs) +{ + int i; + job_data_t *jtable, *jtab; + proc_data_t *proc_data; + opal_value_t *kv, *kvnew; + int rc; + char *srchkey, *ptr; + size_t len = 0; + + OPAL_OUTPUT_VERBOSE((5, orte_db_base.output, + "%s db:hash:fetch_multiple: searching for key %s on proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == key) ? "NULL" : key, ORTE_NAME_PRINT(proc))); + /* get the job data object for this proc */ jtable = NULL; for (i=0; i < job_data.size; i++) { @@ -268,35 +564,41 @@ static int fetch(const orte_process_name_t *proc, return ORTE_ERR_OUT_OF_RESOURCE; } - /* if the key is NULL, then return all data for this proc */ + /* if the key is NULL, then return all the values */ if (NULL == key) { - for (kv = (orte_db_keyval_t *) opal_list_get_first(&proc_data->data); - kv != (orte_db_keyval_t *) opal_list_get_end(&proc_data->data); - kv = (orte_db_keyval_t *) opal_list_get_next(kv)) { - ans = OBJ_NEW(orte_db_keyval_t); - ans->key = strdup(kv->key); - ans->value.bytes = malloc(kv->value.size); - memcpy(ans->value.bytes, kv->value.bytes, kv->value.size); - ans->value.size = kv->value.size; - opal_list_append(values, &ans->super); + for (kv = (opal_value_t*) opal_list_get_first(&proc_data->data); + kv != (opal_value_t*) opal_list_get_end(&proc_data->data); + kv = (opal_value_t*) opal_list_get_next(kv)) { + if (OPAL_SUCCESS != (rc = opal_dss.copy((void**)&kvnew, kv, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_list_append(kvs, &kvnew->super); } return ORTE_SUCCESS; } - /* find the value */ - if (NULL == (kv = lookup_keyval(proc_data, key))) { - /* again, we eventually will attempt to fetch the data - for - * now, just report it as an error */ - return ORTE_ERR_NOT_FOUND; + /* see if the key includes a wildcard */ + srchkey = strdup(key); + if (NULL != (ptr = strchr(srchkey, '*'))) { + *ptr = '\0'; + len = strlen(srchkey); } - /* copy the data across */ - ans = OBJ_NEW(orte_db_keyval_t); - ans->value.bytes = (uint8_t*)malloc(kv->value.size); - 
memcpy(ans->value.bytes, kv->value.bytes, kv->value.size); - ans->value.size = kv->value.size; - opal_list_append(values, &ans->super); - + /* otherwise, find all matching keys and return them */ + for (kv = (opal_value_t*) opal_list_get_first(&proc_data->data); + kv != (opal_value_t*) opal_list_get_end(&proc_data->data); + kv = (opal_value_t*) opal_list_get_next(kv)) { + if ((0 < len && 0 == strncmp(srchkey, kv->key, len)) || + (0 == len && 0 == strcmp(key, kv->key))) { + if (OPAL_SUCCESS != (rc = opal_dss.copy((void**)&kvnew, kv, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_list_append(kvs, &kvnew->super); + } + } + free(srchkey); return ORTE_SUCCESS; } @@ -305,7 +607,7 @@ static int remove_data(const orte_process_name_t *proc, const char *key) int i, save_loc; job_data_t *jtable, *jtab; proc_data_t *proc_data; - orte_db_keyval_t *kv; + opal_value_t *kv; /* if proc is NULL, remove all data from the database */ if (NULL == proc) { @@ -351,7 +653,7 @@ static int remove_data(const orte_process_name_t *proc, const char *key) /* if key is NULL, remove all data for this proc */ if (NULL == key) { - while (NULL != (kv = (orte_db_keyval_t *) opal_list_remove_first(&proc_data->data))) { + while (NULL != (kv = (opal_value_t *) opal_list_remove_first(&proc_data->data))) { OBJ_RELEASE(kv); } /* remove the proc_data object itself from the jtable */ @@ -362,9 +664,9 @@ static int remove_data(const orte_process_name_t *proc, const char *key) } /* remove this item */ - for (kv = (orte_db_keyval_t*) opal_list_get_first(&proc_data->data); - kv != (orte_db_keyval_t*) opal_list_get_end(&proc_data->data); - kv = (orte_db_keyval_t*) opal_list_get_next(kv)) { + for (kv = (opal_value_t*) opal_list_get_first(&proc_data->data); + kv != (opal_value_t*) opal_list_get_end(&proc_data->data); + kv = (opal_value_t*) opal_list_get_next(kv)) { if (0 == strcmp(key, kv->key)) { OBJ_RELEASE(kv); break; diff --git a/orte/mca/ess/alps/ess_alps_component.c 
b/orte/mca/ess/alps/ess_alps_component.c index 7a5de64c6e..77ba0bbc18 100644 --- a/orte/mca/ess/alps/ess_alps_component.c +++ b/orte/mca/ess/alps/ess_alps_component.c @@ -42,7 +42,7 @@ orte_ess_base_component_t mca_ess_alps_component = { /* First, the mca_component_t struct containing meta information about the component itself */ { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "alps", diff --git a/orte/mca/ess/alps/ess_alps_module.c b/orte/mca/ess/alps/ess_alps_module.c index 6afaab6787..7b7cabb3c5 100644 --- a/orte/mca/ess/alps/ess_alps_module.c +++ b/orte/mca/ess/alps/ess_alps_module.c @@ -53,13 +53,6 @@ orte_ess_base_module_t orte_ess_alps_module = { rte_init, rte_finalize, orte_ess_base_app_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - orte_ess_base_update_nidmap, NULL /* ft_event */ }; diff --git a/orte/mca/ess/base/base.h b/orte/mca/ess/base/base.h index 467e2e5a6c..8b20d4a51e 100644 --- a/orte/mca/ess/base/base.h +++ b/orte/mca/ess/base/base.h @@ -83,14 +83,6 @@ ORTE_DECLSPEC int orte_ess_base_tool_finalize(void); ORTE_DECLSPEC int orte_ess_base_orted_setup(char **hosts); ORTE_DECLSPEC int orte_ess_base_orted_finalize(void); -ORTE_DECLSPEC opal_hwloc_locality_t orte_ess_base_proc_get_locality(orte_process_name_t *proc); -ORTE_DECLSPEC orte_vpid_t orte_ess_base_proc_get_daemon(orte_process_name_t *proc); -ORTE_DECLSPEC char* orte_ess_base_proc_get_hostname(orte_process_name_t *proc); -ORTE_DECLSPEC orte_local_rank_t orte_ess_base_proc_get_local_rank(orte_process_name_t *proc); -ORTE_DECLSPEC orte_node_rank_t orte_ess_base_proc_get_node_rank(orte_process_name_t *proc); -ORTE_DECLSPEC int orte_ess_base_update_pidmap(opal_byte_object_t *bo); -ORTE_DECLSPEC int orte_ess_base_update_nidmap(opal_byte_object_t *bo); - /* Detect whether or not this 
proc is bound - if not, * see if it should bind itself */ diff --git a/orte/mca/ess/base/ess_base_fns.c b/orte/mca/ess/base/ess_base_fns.c index 4cea983f7f..046d790300 100644 --- a/orte/mca/ess/base/ess_base_fns.c +++ b/orte/mca/ess/base/ess_base_fns.c @@ -40,235 +40,11 @@ #include "orte/mca/ess/base/base.h" -static orte_proc_t* find_proc(orte_process_name_t *proc) /* used by daemons */ -{ - orte_job_t *jdata; - - if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return NULL; - } - - return (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); -} - -opal_hwloc_locality_t orte_ess_base_proc_get_locality(orte_process_name_t *proc) -{ - orte_pmap_t *pmap; - - if (NULL == (pmap = orte_util_lookup_pmap(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, - "%s LOOKING FOR PROC %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - return OPAL_PROC_NON_LOCAL; - } - - return pmap->locality; -} - -orte_vpid_t orte_ess_base_proc_get_daemon(orte_process_name_t *proc) -{ - orte_nid_t *nid; - orte_proc_t *pdata; - orte_vpid_t vpid; - - if (NULL == proc) { - return ORTE_VPID_INVALID; - } - - if (ORTE_JOBID_IS_DAEMON(proc->jobid)) { - return proc->vpid; - } - - if (ORTE_PROC_IS_APP) { - if (NULL == (nid = orte_util_lookup_nid(proc))) { - return ORTE_VPID_INVALID; - } - vpid = nid->daemon; - } else { - /* get the job data */ - if (NULL == (pdata = find_proc(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_VPID_INVALID; - } - - if (NULL == pdata->node || NULL == pdata->node->daemon) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_VPID_INVALID; - } - vpid = pdata->node->daemon->name.vpid; - } - - - OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, - "%s ess:base: proc %s is hosted by daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - ORTE_VPID_PRINT(vpid))); - - return vpid; -} - -char* 
orte_ess_base_proc_get_hostname(orte_process_name_t *proc) -{ - orte_nid_t *nid; - orte_proc_t *pdata; - char *hostname; - - if (NULL == proc) { - return NULL; - } - - if (ORTE_PROC_IS_APP) { - if (NULL == (nid = orte_util_lookup_nid(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, - "%s LOOKING FOR PROC %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - return NULL; - } - hostname = nid->name; - } else { - if (NULL == (pdata = find_proc(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return NULL; - } - hostname = pdata->node->name; - } - - OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, - "%s ess:base: proc %s is on host %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - hostname)); - - return hostname; -} - -orte_local_rank_t orte_ess_base_proc_get_local_rank(orte_process_name_t *proc) -{ - orte_pmap_t *pmap; - orte_proc_t *pdata; - orte_local_rank_t lrank; - - if (NULL == proc) { - return ORTE_LOCAL_RANK_INVALID; - } - - if (ORTE_PROC_IS_APP) { - if (NULL == (pmap = orte_util_lookup_pmap(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_LOCAL_RANK_INVALID; - } - lrank = pmap->local_rank; - } else { - if (NULL == (pdata = find_proc(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_LOCAL_RANK_INVALID; - } - lrank = pdata->local_rank; - } - - OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, - "%s ess:base: proc %s has local rank %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - (int)lrank)); - - return lrank; -} - -orte_node_rank_t orte_ess_base_proc_get_node_rank(orte_process_name_t *proc) -{ - orte_pmap_t *pmap; - orte_proc_t *pdata; - orte_node_rank_t nrank; - - if (NULL == proc) { - return ORTE_NODE_RANK_INVALID; - } - - if (ORTE_PROC_IS_APP) { - /* is this me? */ - if (proc->jobid == ORTE_PROC_MY_NAME->jobid && - proc->vpid == ORTE_PROC_MY_NAME->vpid) { - /* yes it is - reply with my rank. 
This is necessary - * because the pidmap will not have arrived when I - * am starting up, and if we use static ports, then - * I need to know my node rank during init - */ - return orte_process_info.my_node_rank; - } - if (NULL == (pmap = orte_util_lookup_pmap(proc))) { - return ORTE_NODE_RANK_INVALID; - } - nrank = pmap->node_rank; - } else { - if (NULL == (pdata = find_proc(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_NODE_RANK_INVALID; - } - nrank = pdata->node_rank; - } - - OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, - "%s ess:base: proc %s has node rank %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), - (int)nrank)); - - return nrank; -} - -int orte_ess_base_update_pidmap(opal_byte_object_t *bo) -{ - int ret; - - OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:base: updating pidmap", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* build the pmap */ - if (ORTE_PROC_IS_APP) { - if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo))) { - ORTE_ERROR_LOG(ret); - } - } else { - if (ORTE_SUCCESS != (ret = orte_util_decode_daemon_pidmap(bo))) { - ORTE_ERROR_LOG(ret); - } - } - - return ret; -} - -int orte_ess_base_update_nidmap(opal_byte_object_t *bo) -{ - int rc; - - /* decode the nidmap - the util will know what to do */ - if (ORTE_PROC_IS_APP) { - if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo))) { - ORTE_ERROR_LOG(rc); - } - } else { - if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(bo))) { - ORTE_ERROR_LOG(rc); - } - } - - return rc; -} - int orte_ess_base_proc_binding(void) { #if OPAL_HAVE_HWLOC hwloc_obj_t node, obj; hwloc_cpuset_t cpus, nodeset; - orte_node_rank_t nrank; hwloc_obj_type_t target; unsigned int cache_level = 0; struct hwloc_topology_support *support; @@ -356,7 +132,7 @@ int orte_ess_base_proc_binding(void) /* cleanup */ hwloc_bitmap_free(cpus); /* get the node rank */ - if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) { + if (ORTE_NODE_RANK_INVALID == 
orte_process_info.my_node_rank) { /* this is not an error - could be due to being * direct launched - so just ignore and leave * us unbound @@ -371,7 +147,7 @@ int orte_ess_base_proc_binding(void) */ if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, - 0, nrank, OPAL_HWLOC_LOGICAL))) { + 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting hwthread object"; goto error; @@ -383,7 +159,7 @@ int orte_ess_base_proc_binding(void) goto error; } orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL; - orte_process_info.bind_idx = nrank; + orte_process_info.bind_idx = orte_process_info.my_node_rank; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound to hwthread", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -392,7 +168,7 @@ int orte_ess_base_proc_binding(void) * core on this node */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, - 0, nrank, OPAL_HWLOC_LOGICAL))) { + 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; @@ -404,7 +180,7 @@ int orte_ess_base_proc_binding(void) goto error; } orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL; - orte_process_info.bind_idx = nrank; + orte_process_info.bind_idx = orte_process_info.my_node_rank; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound to core", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -413,7 +189,7 @@ int orte_ess_base_proc_binding(void) * object that the nrank-th core belongs to */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, - 0, nrank, OPAL_HWLOC_LOGICAL))) { + 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; diff --git a/orte/mca/ess/base/ess_base_open.c b/orte/mca/ess/base/ess_base_open.c 
index dfa3722493..f8da01bf18 100644 --- a/orte/mca/ess/base/ess_base_open.c +++ b/orte/mca/ess/base/ess_base_open.c @@ -41,13 +41,6 @@ orte_ess_base_module_t orte_ess = { NULL, /* init */ NULL, /* finalize */ NULL, /* abort */ - NULL, /* proc_get_locality */ - NULL, /* proc_get_daemon */ - NULL, /* proc_get_hostname */ - NULL, /* get_local_rank */ - NULL, /* get_node_rank */ - NULL, /* update_pidmap */ - NULL, /* update_nidmap */ NULL /* ft_event */ }; int orte_ess_base_output; diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index a8700bdc7c..9016611d33 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -328,11 +328,6 @@ int orte_ess_base_orted_setup(char **hosts) */ orte_routed.update_routing_plan(); - if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { - ORTE_ERROR_LOG(ret); - error = "orte_util_nidmap_init"; - goto error; - } /* extract the node info from the environment and * build a nidmap from it */ diff --git a/orte/mca/ess/env/ess_env_component.c b/orte/mca/ess/env/ess_env_component.c index 5e59b307bb..f2346e1765 100644 --- a/orte/mca/ess/env/ess_env_component.c +++ b/orte/mca/ess/env/ess_env_component.c @@ -40,7 +40,7 @@ extern orte_ess_base_module_t orte_ess_env_module; */ orte_ess_base_component_t mca_ess_env_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "env", diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index e6f2087692..08e095118f 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -87,13 +87,6 @@ orte_ess_base_module_t orte_ess_env_module = { rte_init, rte_finalize, orte_ess_base_app_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - 
orte_ess_base_update_nidmap, #if OPAL_ENABLE_FT_CR == 1 rte_ft_event #else @@ -157,7 +150,7 @@ static int rte_init(void) goto error; } - /* if one was provided, build my nidmap */ + /* if data was provided, update the database */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; diff --git a/orte/mca/ess/ess.h b/orte/mca/ess/ess.h index 136472dcf0..80b17ec956 100644 --- a/orte/mca/ess/ess.h +++ b/orte/mca/ess/ess.h @@ -64,68 +64,6 @@ typedef int (*orte_ess_base_module_finalize_fn_t)(void); */ typedef void (*orte_ess_base_module_abort_fn_t)(int status, bool report); -/** - * Get the locality flag of the specified process - * - * MPI procs need to know whether a proc shares a common socket, - * board, node, computing unit, or cluster. This function provides - * a means for an MPI proc to query the locality of another proc. - */ -typedef opal_hwloc_locality_t (*orte_ess_base_module_get_proc_locality_fn_t)(orte_process_name_t *proc); - -/** - * Get the vpid of the daemon who hosts the specified proc - * - * In order to route messages to the correct place, the RML - * and routed modules need to know the vpid of the daemon - * that hosts the intended recipient. This API accesses - * the pidmap/nidmap to retrieve that info - */ -typedef orte_vpid_t (*orte_ess_base_module_proc_get_daemon_fn_t)(orte_process_name_t *proc); - -/** - * Get the hostname where a proc resides - * - * MPI procs need to know the hostname where a specified proc resides. - * Different environments provide that info in different ways - e.g., they may - * provide a callable utility to return the answer, or download - * a map of information into each process. This API provides a - * means for each environment to do the "right thing". - * - * NOTE: To avoid memory waste, this function returns a pointer - * to a static storage. IT MUST NOT BE FREED! 
- */ -typedef char* (*orte_ess_base_module_proc_get_hostname_fn_t)(orte_process_name_t *proc); - -/** - * Get the local rank of a remote process - */ -typedef orte_local_rank_t (*orte_ess_base_module_proc_get_local_rank_fn_t)(orte_process_name_t *proc); - -/** - * Get the node rank of a remote process - */ -typedef orte_node_rank_t (*orte_ess_base_module_proc_get_node_rank_fn_t)(orte_process_name_t *proc); - -/** - * Update the pidmap - * - * When a job is dynamically launched via comm_spawn, the pre-existing daemons need to - * update their knowledge of the process map within the job so they can properly do - * things like route messages. This API allows daemons - and anyone else who wants to - to - * add a pidmap for a new job - */ -typedef int (*orte_ess_base_module_update_pidmap_fn_t)(opal_byte_object_t *bo); - -/** - * Update a nidmap - * - * When a job is dynamically launched via comm_spawn, the pre-existing daemons need to - * update their knowledge of the node map that contains info on what daemon resides - * on which nodes - */ -typedef int (*orte_ess_base_module_update_nidmap_fn_t)(opal_byte_object_t *bo); - /** * Handle fault tolerance updates * @@ -139,21 +77,14 @@ typedef int (*orte_ess_base_module_ft_event_fn_t)(int state); /* * the standard module data structure */ -struct orte_ess_base_module_1_0_0_t { +struct orte_ess_base_module_3_0_0_t { orte_ess_base_module_init_fn_t init; orte_ess_base_module_finalize_fn_t finalize; orte_ess_base_module_abort_fn_t abort; - orte_ess_base_module_get_proc_locality_fn_t proc_get_locality; - orte_ess_base_module_proc_get_daemon_fn_t proc_get_daemon; - orte_ess_base_module_proc_get_hostname_fn_t proc_get_hostname; - orte_ess_base_module_proc_get_local_rank_fn_t get_local_rank; - orte_ess_base_module_proc_get_node_rank_fn_t get_node_rank; - orte_ess_base_module_update_pidmap_fn_t update_pidmap; - orte_ess_base_module_update_nidmap_fn_t update_nidmap; orte_ess_base_module_ft_event_fn_t ft_event; }; -typedef struct 
orte_ess_base_module_1_0_0_t orte_ess_base_module_1_0_0_t; -typedef struct orte_ess_base_module_1_0_0_t orte_ess_base_module_t; +typedef struct orte_ess_base_module_3_0_0_t orte_ess_base_module_3_0_0_t; +typedef struct orte_ess_base_module_3_0_0_t orte_ess_base_module_t; /* * the standard component data structure @@ -168,9 +99,9 @@ typedef struct orte_ess_base_component_2_0_0_t orte_ess_base_component_t; /* * Macro for use in components that are of type ess */ -#define ORTE_ESS_BASE_VERSION_2_0_0 \ +#define ORTE_ESS_BASE_VERSION_3_0_0 \ MCA_BASE_VERSION_2_0_0, \ - "ess", 2, 0, 0 + "ess", 3, 0, 0 /* Global structure for accessing ESS functions */ ORTE_DECLSPEC extern orte_ess_base_module_t orte_ess; /* holds selected module's function pointers */ diff --git a/orte/mca/ess/hnp/ess_hnp_component.c b/orte/mca/ess/hnp/ess_hnp_component.c index 69bc673493..f3ed1e2d9f 100644 --- a/orte/mca/ess/hnp/ess_hnp_component.c +++ b/orte/mca/ess/hnp/ess_hnp_component.c @@ -40,7 +40,7 @@ extern orte_ess_base_module_t orte_ess_hnp_module; */ orte_ess_base_component_t mca_ess_hnp_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "hnp", diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 4b7b8d0963..6d3302f089 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -92,20 +92,11 @@ static int rte_init(void); static int rte_finalize(void); static void rte_abort(int status, bool report) __opal_attribute_noreturn__; -static int update_pidmap(opal_byte_object_t *bo); -static int update_nidmap(opal_byte_object_t *bo); orte_ess_base_module_t orte_ess_hnp_module = { rte_init, rte_finalize, rte_abort, - NULL, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - update_pidmap, - update_nidmap, NULL /* ft_event */ }; @@ -742,32 +733,6 @@ static void rte_abort(int status, bool 
report) exit(status); } -static int update_pidmap(opal_byte_object_t *bo) -{ - /* there is nothing to do here - the HNP can resolve - * all requests directly from its internal data. However, - * we do need to free the data in the byte object to - * be consistent with other modules - */ - if (NULL != bo && NULL != bo->bytes) { - free(bo->bytes); - } - return ORTE_SUCCESS; -} - -static int update_nidmap(opal_byte_object_t *bo) -{ - /* there is nothing to do here - the HNP can resolve - * all requests directly from its internal data. However, - * we do need to free the data in the byte object to - * be consistent with other modules - */ - if (NULL != bo && NULL != bo->bytes) { - free(bo->bytes); - } - return ORTE_SUCCESS; -} - /* * Attempt to terminate the job and wait for callback indicating * the job has been aborted. diff --git a/orte/mca/ess/lsf/ess_lsf_component.c b/orte/mca/ess/lsf/ess_lsf_component.c index 1873d381b2..47a991c9b0 100644 --- a/orte/mca/ess/lsf/ess_lsf_component.c +++ b/orte/mca/ess/lsf/ess_lsf_component.c @@ -37,7 +37,7 @@ extern orte_ess_base_module_t orte_ess_lsf_module; */ orte_ess_base_component_t mca_ess_lsf_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "lsf", diff --git a/orte/mca/ess/lsf/ess_lsf_module.c b/orte/mca/ess/lsf/ess_lsf_module.c index 7d96f125b4..59a2a7829c 100644 --- a/orte/mca/ess/lsf/ess_lsf_module.c +++ b/orte/mca/ess/lsf/ess_lsf_module.c @@ -55,13 +55,6 @@ orte_ess_base_module_t orte_ess_lsf_module = { rte_init, rte_finalize, orte_ess_base_app_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - orte_ess_base_update_nidmap, NULL /* ft_event */ }; diff --git a/orte/mca/ess/pmi/ess_pmi_component.c b/orte/mca/ess/pmi/ess_pmi_component.c index 520149413f..849a1f5bba 100644 --- 
a/orte/mca/ess/pmi/ess_pmi_component.c +++ b/orte/mca/ess/pmi/ess_pmi_component.c @@ -42,7 +42,7 @@ static int pmi_component_query(mca_base_module_t **module, int *priority); */ orte_ess_base_component_t mca_ess_pmi_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "pmi", diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 943d84541e..5f0ad67023 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -47,14 +47,15 @@ #include "opal/mca/hwloc/base/base.h" #include "opal/util/printf.h" +#include "orte/mca/db/db.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" -#include "orte/mca/errmgr/errmgr.h" #include "orte/util/name_fns.h" -#include "orte/runtime/orte_globals.h" #include "orte/util/nidmap.h" #include "orte/util/pre_condition_transports.h" #include "orte/util/regex.h" +#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/mca/ess/ess.h" @@ -69,13 +70,6 @@ orte_ess_base_module_t orte_ess_pmi_module = { rte_init, rte_finalize, rte_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - orte_ess_base_update_nidmap, NULL /* ft_event */ }; @@ -93,12 +87,12 @@ static int rte_init(void) uint64_t unique_key[2]; char *cs_env, *string_key; char *pmi_id=NULL; - orte_nid_t *nid; - orte_jmap_t *jmap; - orte_pmap_t *pmap; int *ranks; char *tmp; orte_jobid_t jobid; + orte_process_name_t proc; + orte_local_rank_t local_rank; + orte_node_rank_t node_rank; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -227,42 +221,28 @@ static int rte_init(void) */ orte_process_info.app_num = 0; - /* setup the nidmap arrays - they will be filled by the modex */ - if (ORTE_SUCCESS != 
(ret = orte_util_nidmap_init(NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_util_nidmap_init"; - goto error; - } - /* initialize our entry */ - if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { - ORTE_ERROR_LOG(ret); - error = "orte_util_setup_local_nidmap_entries"; - goto error; - } - /* correct the daemon entry on our nidmap object - note that - * each proc's nidmap will be different, but the only thing that - * matters here (since we are not routing messages) is that - * we know which procs are on the same nodes - */ - nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, 0); - nid->daemon = 0; /* setup my daemon's name - arbitrary, since we don't route * messages */ ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = 0; - /* get the job map for this job */ - jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, 0); - /* update the num procs */ - jmap->num_procs = orte_process_info.num_procs; - /* set the size of the pidmap storage so we minimize realloc's */ - if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) { + /* ensure we pick the correct critical components */ + putenv("OMPI_MCA_grpcomm=pmi"); + putenv("OMPI_MCA_routed=direct"); + + /* now use the default procedure to finish my setup */ + if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { ORTE_ERROR_LOG(ret); - error = "could not set array size for pidmap"; + error = "orte_ess_base_app_setup"; goto error; } + /* store our info into the database */ + if (ORTE_SUCCESS != (ret = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_HOSTNAME, orte_process_info.nodename, OPAL_STRING))) { + error = "db store daemon vpid"; + goto error; + } /* get our local proc info to find our local rank */ if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) { ORTE_PMI_ERROR(ret, "PMI_Get_clique_size"); @@ -279,29 +259,26 @@ static int rte_init(void) * cycle thru the array and update the local/node * rank info */ + proc.jobid = ORTE_PROC_MY_NAME->jobid; 
for (j=0; j < i; j++) { - if (NULL == (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, ranks[j]))) { - /* need to create this entry */ - pmap = OBJ_NEW(orte_pmap_t); - pmap->node = nid->index; - opal_pointer_array_set_item(&jmap->pmap, ranks[j], pmap); + proc.vpid = ranks[j]; + local_rank = j; + node_rank = j; + if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) { + orte_process_info.my_local_rank = local_rank; + orte_process_info.my_node_rank = node_rank; + } + if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) { + error = "db store local rank"; + goto error; + } + if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) { + error = "db store node rank"; + goto error; } - pmap->local_rank = j; - pmap->node_rank = j; } free(ranks); - /* ensure we pick the correct critical components */ - putenv("OMPI_MCA_grpcomm=pmi"); - putenv("OMPI_MCA_routed=direct"); - - /* now use the default procedure to finish my setup */ - if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { - ORTE_ERROR_LOG(ret); - error = "orte_ess_base_app_setup"; - goto error; - } - /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; diff --git a/orte/mca/ess/singleton/ess_singleton_component.c b/orte/mca/ess/singleton/ess_singleton_component.c index 35d026a3e7..710b90961d 100644 --- a/orte/mca/ess/singleton/ess_singleton_component.c +++ b/orte/mca/ess/singleton/ess_singleton_component.c @@ -42,7 +42,7 @@ orte_ess_base_component_t mca_ess_singleton_component = { /* First, the mca_component_t struct containing meta information about the component itself */ { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "singleton", diff --git a/orte/mca/ess/singleton/ess_singleton_module.c b/orte/mca/ess/singleton/ess_singleton_module.c index 7a3cac963d..33b3ea9aa9 100644 --- 
a/orte/mca/ess/singleton/ess_singleton_module.c +++ b/orte/mca/ess/singleton/ess_singleton_module.c @@ -74,13 +74,6 @@ orte_ess_base_module_t orte_ess_singleton_module = { rte_init, rte_finalize, orte_ess_base_app_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - orte_ess_base_update_nidmap, NULL /* ft_event */ }; diff --git a/orte/mca/ess/slurm/ess_slurm_component.c b/orte/mca/ess/slurm/ess_slurm_component.c index 7ea385179e..389cfb8cf9 100644 --- a/orte/mca/ess/slurm/ess_slurm_component.c +++ b/orte/mca/ess/slurm/ess_slurm_component.c @@ -40,7 +40,7 @@ extern orte_ess_base_module_t orte_ess_slurm_module; */ orte_ess_base_component_t mca_ess_slurm_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "slurm", diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 4960df9272..e6d6415fd6 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -58,13 +58,6 @@ orte_ess_base_module_t orte_ess_slurm_module = { rte_init, rte_finalize, orte_ess_base_app_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - orte_ess_base_update_nidmap, NULL /* ft_event */ }; diff --git a/orte/mca/ess/tm/ess_tm_component.c b/orte/mca/ess/tm/ess_tm_component.c index 106a3365e9..9875023dcc 100644 --- a/orte/mca/ess/tm/ess_tm_component.c +++ b/orte/mca/ess/tm/ess_tm_component.c @@ -40,7 +40,7 @@ extern orte_ess_base_module_t orte_ess_tm_module; */ orte_ess_base_component_t mca_ess_tm_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "tm", diff --git 
a/orte/mca/ess/tm/ess_tm_module.c b/orte/mca/ess/tm/ess_tm_module.c index 21d14a81b8..b560e9b04a 100644 --- a/orte/mca/ess/tm/ess_tm_module.c +++ b/orte/mca/ess/tm/ess_tm_module.c @@ -57,13 +57,6 @@ orte_ess_base_module_t orte_ess_tm_module = { rte_init, rte_finalize, orte_ess_base_app_abort, - orte_ess_base_proc_get_locality, - orte_ess_base_proc_get_daemon, - orte_ess_base_proc_get_hostname, - orte_ess_base_proc_get_local_rank, - orte_ess_base_proc_get_node_rank, - orte_ess_base_update_pidmap, - orte_ess_base_update_nidmap, NULL /* ft_event */ }; diff --git a/orte/mca/ess/tool/ess_tool_component.c b/orte/mca/ess/tool/ess_tool_component.c index 796e6db276..6f5b93b17e 100644 --- a/orte/mca/ess/tool/ess_tool_component.c +++ b/orte/mca/ess/tool/ess_tool_component.c @@ -40,7 +40,7 @@ extern orte_ess_base_module_t orte_ess_tool_module; */ orte_ess_base_component_t mca_ess_tool_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, + ORTE_ESS_BASE_VERSION_3_0_0, /* Component name and version */ "tool", diff --git a/orte/mca/ess/tool/ess_tool_module.c b/orte/mca/ess/tool/ess_tool_module.c index f9ec730521..5949e18e76 100644 --- a/orte/mca/ess/tool/ess_tool_module.c +++ b/orte/mca/ess/tool/ess_tool_module.c @@ -43,20 +43,12 @@ static int rte_init(void); static void rte_abort(int status, bool report) __opal_attribute_noreturn__; -static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); orte_ess_base_module_t orte_ess_tool_module = { rte_init, orte_ess_base_tool_finalize, rte_abort, - NULL, /* don't need a local procs fn */ - proc_get_daemon, - NULL, /* don't need a proc_get_hostname fn */ - NULL, /* don't need a proc_get_local_rank fn */ - NULL, /* don't need a proc_get_node_rank fn */ - NULL, /* don't need to update_pidmap */ - NULL, /* don't need to update_nidmap */ NULL /* ft_event */ }; @@ -134,7 +126,3 @@ static void rte_abort(int status, bool report) exit(0); } -static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) -{ - return ORTE_VPID_INVALID; -} diff --git 
a/orte/mca/grpcomm/bad/grpcomm_bad_module.c b/orte/mca/grpcomm/bad/grpcomm_bad_module.c index a32bbc6ed8..f06ec5a67e 100644 --- a/orte/mca/grpcomm/bad/grpcomm_bad_module.c +++ b/orte/mca/grpcomm/bad/grpcomm_bad_module.c @@ -62,10 +62,7 @@ orte_grpcomm_base_module_t orte_grpcomm_bad_module = { xcast, bad_allgather, bad_barrier, - orte_grpcomm_base_set_proc_attr, - orte_grpcomm_base_get_proc_attr, - orte_grpcomm_base_modex, - orte_grpcomm_base_purge_proc_attrs + orte_grpcomm_base_modex }; /** diff --git a/orte/mca/grpcomm/base/base.h b/orte/mca/grpcomm/base/base.h index 4ad68e5cc6..7376340bbe 100644 --- a/orte/mca/grpcomm/base/base.h +++ b/orte/mca/grpcomm/base/base.h @@ -79,16 +79,9 @@ ORTE_DECLSPEC void orte_grpcomm_base_rollup_recv(int status, orte_process_name_t void* cbdata); /* modex support */ -ORTE_DECLSPEC int orte_grpcomm_base_set_proc_attr(const char *attr_name, - const void *data, - size_t size); -ORTE_DECLSPEC int orte_grpcomm_base_get_proc_attr(const orte_process_name_t *proc, - const char *attribute_name, void **val, - size_t *size); ORTE_DECLSPEC void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata); ORTE_DECLSPEC void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata); ORTE_DECLSPEC int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex); -ORTE_DECLSPEC int orte_grpcomm_base_purge_proc_attrs(void); ORTE_DECLSPEC int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf); ORTE_DECLSPEC int orte_grpcomm_base_update_modex_entries(orte_process_name_t *proc_name, opal_buffer_t *rbuf); diff --git a/orte/mca/grpcomm/base/grpcomm_base_modex.c b/orte/mca/grpcomm/base/grpcomm_base_modex.c index 5b536fd300..d88816558e 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_modex.c +++ b/orte/mca/grpcomm/base/grpcomm_base_modex.c @@ -65,8 +65,6 @@ orte_grpcomm_coll_id_t orte_grpcomm_base_get_coll_id(void) int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex) { int rc; - orte_local_rank_t local_rank; - orte_node_rank_t 
node_rank; orte_namelist_t *nm; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, @@ -113,15 +111,13 @@ int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex) } /* pack our node rank */ - node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &node_rank, 1, ORTE_NODE_RANK))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.my_node_rank, 1, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack our local rank */ - local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &local_rank, 1, ORTE_LOCAL_RANK))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.my_local_rank, 1, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -167,16 +163,14 @@ int orte_grpcomm_base_modex(orte_grpcomm_collective_t *modex) void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata) { - int rc, n, cnt; + int rc, cnt; orte_process_name_t proc_name; char *hostname; orte_vpid_t daemon; orte_node_rank_t node_rank; orte_local_rank_t local_rank; - orte_nid_t *nid; - orte_jmap_t *jmap; - orte_pmap_t *pmap; orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; + opal_hwloc_locality_t locality; OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s STORING PEER MODEX DATA", @@ -185,155 +179,112 @@ void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata) /* unpack the process name */ cnt=1; while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &proc_name, &cnt, ORTE_NAME))) { - /* unpack the hostname */ + /* unpack and store the hostname */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &hostname, &cnt, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* unpack the daemon vpid */ + /* unpack and store 
the daemon vpid */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &daemon, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_DAEMON_VPID, &daemon, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } - /* unpack the node rank */ + /* unpack and store the node rank */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } - + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* unpack the local rank */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } - - /* UPDATE THE NIDMAP/PIDMAP TO SUPPORT DYNAMIC OPERATIONS */ - - /* find this proc's node in the nidmap */ - nid = NULL; - for (n=0; NULL != (nid = (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, n)); n++) { - if (0 == strcmp(hostname, nid->name)) { - break; - } - } - if (NULL == nid) { - /* node wasn't found - let's add it */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex no nidmap entry for node %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == hostname) ? 
"NULL" : hostname)); - nid = OBJ_NEW(orte_nid_t); - nid->name = strdup(hostname); - nid->daemon = daemon; - nid->index = opal_pointer_array_add(&orte_nidmap, nid); + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; } - /* see if we have this job in a jobmap */ - if (NULL == (jmap = orte_util_lookup_jmap(proc_name.jobid))) { - /* proc wasn't found - let's add it */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex no jobmap entry for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(proc_name.jobid))); - jmap = OBJ_NEW(orte_jmap_t); - jmap->job = proc_name.jobid; - /* unfortunately, job objects cannot be stored - * by index number as the jobid is a constructed - * value. So we have to just add it to the end - * of the array - */ - opal_pointer_array_add(&orte_jobmap, jmap); - jmap->num_procs = 1; - /* have to add the pidmap entry too, but this - * can be done at the specific site corresponding - * to the proc's vpid - */ - pmap = OBJ_NEW(orte_pmap_t); - pmap->node = nid->index; - pmap->local_rank = local_rank; - pmap->node_rank = node_rank; - opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap); - } else { - /* see if we have this proc in a pidmap */ - if (NULL == (pmap = orte_util_lookup_pmap(&proc_name))) { - /* proc wasn't found - let's add it */ - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:base:full:modex no pidmap entry for proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc_name))); - pmap = OBJ_NEW(orte_pmap_t); - pmap->node = nid->index; - pmap->local_rank = local_rank; - pmap->node_rank = node_rank; - /* this can be done at the specific site corresponding - * to the proc's vpid - */ - opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap); - /* account for the proc entry in the jmap */ - jmap->num_procs++; - } - } - + /* compute the locality and store in the 
database */ #if OPAL_HAVE_HWLOC { opal_hwloc_level_t bind_level; unsigned int bind_idx; - /* unpack the locality info */ + /* unpack and store the locality info */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &bind_level, &cnt, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_BIND_LEVEL, &bind_level, OPAL_HWLOC_LEVEL_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &bind_idx, &cnt, OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_BIND_INDEX, &bind_idx, OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:base:modex setting proc %s level %s idx %u", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name), opal_hwloc_base_print_level(bind_level), bind_idx)); - /* store on the pmap */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &proc_name, ORTE_PROC_MY_NAME)) { /* if this data is from myself, then set locality to all */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:modex setting proc %s locale ALL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name))); - pmap->locality = OPAL_PROC_ALL_LOCAL; + locality = OPAL_PROC_ALL_LOCAL; } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) { /* this is on a different node, then mark as non-local */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:modex setting proc %s locale NONLOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name))); - pmap->locality = OPAL_PROC_NON_LOCAL; + locality = OPAL_PROC_NON_LOCAL; } else if (OPAL_HWLOC_NODE_LEVEL == orte_process_info.bind_level || OPAL_HWLOC_NODE_LEVEL == bind_level) { /* one or both of us is not bound, so all we can say is we are on the * same node */ - pmap->locality = OPAL_PROC_ON_NODE; + locality = OPAL_PROC_ON_NODE; } else { 
/* determine relative location on our node */ - pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, - orte_process_info.bind_level, - orte_process_info.bind_idx, - bind_level, bind_idx); + locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, + orte_process_info.bind_level, + orte_process_info.bind_idx, + bind_level, bind_idx); OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:modex setting proc %s locale %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name), - opal_hwloc_base_print_locality(pmap->locality))); + opal_hwloc_base_print_locality(locality))); } } #else @@ -343,19 +294,23 @@ void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata) "%s grpcomm:base:modex setting proc %s locale ALL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name))); - pmap->locality = OPAL_PROC_ALL_LOCAL; + locality = OPAL_PROC_ALL_LOCAL; } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) { /* this is on a different node, then mark as non-local */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:modex setting proc %s locale NONLOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc_name))); - pmap->locality = OPAL_PROC_NON_LOCAL; + locality = OPAL_PROC_NON_LOCAL; } else { /* must be on our node */ - pmap->locality = OPAL_PROC_ON_NODE; + locality = OPAL_PROC_ON_NODE; } #endif + if (ORTE_SUCCESS != (rc = orte_db.store(&proc_name, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:full:modex: adding modex entry for proc %s", @@ -455,38 +410,23 @@ int orte_grpcomm_base_update_modex_entries(orte_process_name_t *proc_name, * Extract the attribute names and values */ for (j = 0; j < num_recvd_entries; j++) { - int32_t num_bytes; - void *bytes = NULL; - char *attr_name; - + opal_value_t *kv; cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, 
&attr_name, &cnt, OPAL_STRING))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &kv, &cnt, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); goto cleanup; } - - cnt = 1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_bytes, &cnt, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - if (0 < num_bytes) { - if (NULL == (bytes = malloc(num_bytes))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - cnt = (orte_std_cntr_t)num_bytes; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, bytes, &cnt, OPAL_BYTE))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - num_bytes = cnt; + /* if this is me, dump the data - we already have it in the db */ + if (ORTE_PROC_MY_NAME->jobid == proc_name->jobid && + ORTE_PROC_MY_NAME->vpid == proc_name->vpid) { + OBJ_RELEASE(kv); + } else { /* store it in the database */ - if (ORTE_SUCCESS != (rc = orte_db.store(proc_name, attr_name, (void*)bytes, num_bytes))) { + if (ORTE_SUCCESS != (rc = orte_db.store_pointer(proc_name, kv))) { ORTE_ERROR_LOG(rc); goto cleanup; } + /* do not release the kv - the db holds that pointer */ } } @@ -494,82 +434,40 @@ int orte_grpcomm_base_update_modex_entries(orte_process_name_t *proc_name, return rc; } -int orte_grpcomm_base_set_proc_attr(const char *attr_name, - const void *data, - size_t size) -{ - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:set_proc_attr: setting attribute %s data size %lu", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - attr_name, (unsigned long)size)); - - rc = orte_db.store(ORTE_PROC_MY_NAME, attr_name, data, size); - - return rc; -} - -int orte_grpcomm_base_get_proc_attr(const orte_process_name_t *proc, - const char *attribute_name, void **val, - size_t *size) -{ - orte_db_keyval_t *kv; - opal_list_t data; - int rc; - - OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, - "%s grpcomm:get_proc_attr: searching for attr %s on proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), attribute_name, - ORTE_NAME_PRINT(proc))); 
- - /* set defaults */ - *val = NULL; - *size = 0; - - /* fetch the data */ - OBJ_CONSTRUCT(&data, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_db.fetch(proc, attribute_name, &data))) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } - - /* this interface to the MPI layer only supports returning one value */ - if (1 < opal_list_get_size(&data)) { - return ORTE_ERROR; - } - kv = (orte_db_keyval_t*)opal_list_remove_first(&data); - /* transfer the data */ - *val = kv->value.bytes; - *size = kv->value.size; - kv->value.bytes = NULL; - /* cleanup */ - OBJ_RELEASE(kv); - - cleanup: - OBJ_DESTRUCT(&data); - return rc; -} - -int orte_grpcomm_base_purge_proc_attrs(void) -{ - return orte_db.remove(NULL, NULL); -} - int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf) { int rc; int32_t num_entries; - orte_db_keyval_t *kv; + opal_value_t *kv; opal_list_t data; + opal_list_item_t *item, *next; /* fetch our data */ OBJ_CONSTRUCT(&data, opal_list_t); - if (ORTE_SUCCESS != (rc = orte_db.fetch(ORTE_PROC_MY_NAME, NULL, &data))) { + if (ORTE_SUCCESS != (rc = orte_db.fetch_multiple(ORTE_PROC_MY_NAME, NULL, &data))) { ORTE_ERROR_LOG(rc); goto cleanup; } - num_entries = opal_list_get_size(&data); + + /* count the number of entries we will send, purging the rest */ + num_entries = 0; + item = opal_list_get_first(&data); + while (item != opal_list_get_end(&data)) { + kv = (opal_value_t*)item; + next = opal_list_get_next(item); + /* if this is an entry we get from the nidmap, then don't include it here */ + if (0 == strcmp(kv->key, ORTE_DB_HOSTNAME) || + 0 == strcmp(kv->key, ORTE_DB_DAEMON_VPID) || + 0 == strcmp(kv->key, ORTE_DB_NODERANK) || + 0 == strcmp(kv->key, ORTE_DB_LOCALRANK) || + 0 == strcmp(kv->key, ORTE_DB_BIND_LEVEL) || + 0 == strcmp(kv->key, ORTE_DB_BIND_INDEX)) { + opal_list_remove_item(&data, item); + } else { + num_entries++; + } + item = next; + } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:pack_modex: reporting %d entries", @@ -582,26 +480,16 @@ 
int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf) } /* if there are entries, store them */ - while (NULL != (kv = (orte_db_keyval_t*)opal_list_remove_first(&data))) { - if (ORTE_SUCCESS != (opal_dss.pack(buf, &kv->key, 1, OPAL_STRING))) { + while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&data))) { + if (ORTE_SUCCESS != (opal_dss.pack(buf, &kv, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); break; } - if (ORTE_SUCCESS != (opal_dss.pack(buf, &(kv->value.size), 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - break; - } - if (0 < kv->value.size) { - if (ORTE_SUCCESS != (opal_dss.pack(buf, kv->value.bytes, kv->value.size, OPAL_BYTE))) { - ORTE_ERROR_LOG(rc); - break; - } - } OBJ_RELEASE(kv); } cleanup: - while (NULL != (kv = (orte_db_keyval_t*)opal_list_remove_first(&data))) { + while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&data))) { OBJ_RELEASE(kv); } OBJ_DESTRUCT(&data); diff --git a/orte/mca/grpcomm/base/grpcomm_base_receive.c b/orte/mca/grpcomm/base/grpcomm_base_receive.c index b1fd8ecb55..6bda5c3e9d 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_receive.c +++ b/orte/mca/grpcomm/base/grpcomm_base_receive.c @@ -33,7 +33,6 @@ #include "orte/util/proc_info.h" #include "orte/util/error_strings.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" #include "orte/mca/odls/base/base.h" #include "orte/mca/rml/rml.h" #include "orte/mca/routed/routed.h" @@ -374,7 +373,8 @@ void orte_grpcomm_base_progress_collectives(void) orte_grpcomm_collective_t *coll; orte_namelist_t *nm; orte_job_t *jdata; - orte_vpid_t nlp, vpid; + orte_proc_t *proc; + orte_vpid_t nlp; opal_buffer_t *relay; int rc; @@ -427,14 +427,28 @@ void orte_grpcomm_base_progress_collectives(void) nlp += jdata->num_local_procs; } else { /* see if this is a local proc */ - if (ORTE_VPID_INVALID == (vpid = orte_ess.proc_get_daemon(&nm->name))) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) { + OPAL_OUTPUT_VERBOSE((5, 
orte_grpcomm_base.output, + "%s COLL %d PROC %s NOT FOUND", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id, ORTE_NAME_PRINT(&nm->name))); + goto next_coll; + } + if (NULL == proc->node || NULL == proc->node->daemon) { + OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, + "%s COLL %d NODE OR DAEMON NOT FOUND FOR PROC %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + coll->id, ORTE_NAME_PRINT(&nm->name))); + goto next_coll; + } + if (ORTE_VPID_INVALID == proc->node->daemon->name.vpid) { OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s COLL %d VPID %s NONLOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id, ORTE_VPID_PRINT(nm->name.vpid))); continue; } - if (vpid == ORTE_PROC_MY_NAME->vpid) { + if (proc->node->daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:prog:collectives Counting %s as local participant", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), diff --git a/orte/mca/grpcomm/base/grpcomm_base_xcast.c b/orte/mca/grpcomm/base/grpcomm_base_xcast.c index bfd7e61409..4428bd3616 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_xcast.c +++ b/orte/mca/grpcomm/base/grpcomm_base_xcast.c @@ -33,11 +33,11 @@ #include "orte/util/proc_info.h" #include "orte/util/error_strings.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" #include "orte/mca/odls/base/base.h" #include "orte/mca/rml/rml.h" #include "orte/mca/routed/routed.h" #include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/grpcomm/grpcomm_types.h" @@ -89,7 +89,7 @@ void orte_grpcomm_base_xcast_recv(int status, orte_process_name_t* sender, "%s grpcomm:base:xcast updating daemon nidmap", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) { + if (ORTE_SUCCESS != (ret = orte_util_decode_daemon_nodemap(bo))) { ORTE_ERROR_LOG(ret); goto relay; } diff --git a/orte/mca/grpcomm/grpcomm.h b/orte/mca/grpcomm/grpcomm.h index a3aae1424b..b5872abf42 
100644 --- a/orte/mca/grpcomm/grpcomm.h +++ b/orte/mca/grpcomm/grpcomm.h @@ -74,21 +74,9 @@ typedef int (*orte_grpcomm_base_module_barrier_fn_t)(orte_grpcomm_collective_t * * OF HOW THIS ALL WORKS */ -/* send an attribute buffer */ -typedef int (*orte_grpcomm_base_module_modex_set_proc_attr_fn_t)(const char* attr_name, - const void *buffer, size_t size); - -/* get an attribute buffer */ -typedef int (*orte_grpcomm_base_module_modex_get_proc_attr_fn_t)(const orte_process_name_t *name, - const char* attr_name, - void **buffer, size_t *size); - /* perform a modex operation */ typedef int (*orte_grpcomm_base_module_modex_fn_t)(orte_grpcomm_collective_t *coll); -/* purge the internal attr table */ -typedef int (*orte_grpcomm_base_module_purge_proc_attrs_fn_t)(void); - /* * Ver 2.0 */ @@ -99,11 +87,7 @@ struct orte_grpcomm_base_module_2_0_0_t { orte_grpcomm_base_module_xcast_fn_t xcast; orte_grpcomm_base_module_allgather_fn_t allgather; orte_grpcomm_base_module_barrier_fn_t barrier; - /* modex functions */ - orte_grpcomm_base_module_modex_set_proc_attr_fn_t set_proc_attr; - orte_grpcomm_base_module_modex_get_proc_attr_fn_t get_proc_attr; orte_grpcomm_base_module_modex_fn_t modex; - orte_grpcomm_base_module_purge_proc_attrs_fn_t purge_proc_attrs; }; typedef struct orte_grpcomm_base_module_2_0_0_t orte_grpcomm_base_module_2_0_0_t; diff --git a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c index 3273818570..56e23e3e38 100644 --- a/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c +++ b/orte/mca/grpcomm/pmi/grpcomm_pmi_module.c @@ -25,6 +25,7 @@ #include "opal/dss/dss.h" #include "opal/mca/hwloc/base/base.h" +#include "orte/mca/db/db.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml.h" #include "orte/util/name_fns.h" @@ -42,13 +43,7 @@ static int xcast(orte_jobid_t job, orte_rml_tag_t tag); static int pmi_allgather(orte_grpcomm_collective_t *coll); static int pmi_barrier(orte_grpcomm_collective_t *coll); -static int 
pmi_set_proc_attr(const char* attr_name, - const void *buffer, size_t size); -static int pmi_get_proc_attr(const orte_process_name_t name, - const char* attr_name, - void **buffer, size_t *size); static int modex(orte_grpcomm_collective_t *coll); -static int purge_proc_attrs(void); /* Module def */ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = { @@ -57,10 +52,7 @@ orte_grpcomm_base_module_t orte_grpcomm_pmi_module = { xcast, pmi_allgather, pmi_barrier, - pmi_set_proc_attr, - pmi_get_proc_attr, - modex, - purge_proc_attrs + modex }; static int pmi_encode(const void *val, size_t vallen); @@ -345,67 +337,102 @@ static int pmi_get_proc_attr(const orte_process_name_t name, /*** MODEX SECTION ***/ static int modex(orte_grpcomm_collective_t *coll) { - int rc, i; + int rc; size_t len; char *rml_uri; orte_vpid_t v; orte_process_name_t name; - orte_jmap_t *jmap; - orte_nid_t *nid, *loc; - orte_pmap_t *pmap; void *tmp_val; + orte_node_rank_t node_rank; + orte_local_rank_t local_rank; + opal_list_t modex_data; + opal_value_t *kv; + uint32_t arch; + uint8_t th_level; + opal_byte_object_t bo; + char *hostname; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:pmi: modex entered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* provide our hostname so others can know our location */ + /* check size of our hostname */ if (strlen(orte_process_info.nodename) > (size_t)pmi_vallen_max) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); return ORTE_ERR_VALUE_OUT_OF_BOUNDS; } - rc = pmi_set_proc_attr ("HOSTNAME", orte_process_info.nodename, strlen(orte_process_info.nodename)); - if (ORTE_SUCCESS != rc) { - return rc; - } - /* add our oob endpoint info so that oob communications * can be supported */ rml_uri = orte_rml.get_contact_info(); - rc = pmi_set_proc_attr ("RMLURI", rml_uri, strlen (rml_uri)); + rc = pmi_set_proc_attr (ORTE_DB_RMLURI, rml_uri, strlen (rml_uri)); if (ORTE_SUCCESS != rc) { return rc; } free(rml_uri); #if OPAL_HAVE_HWLOC - rc = pmi_set_proc_attr 
("BIND_LEVEL", &orte_process_info.bind_level, sizeof (orte_process_info.bind_level)); + rc = pmi_set_proc_attr (ORTE_DB_BIND_LEVEL, &orte_process_info.bind_level, sizeof (orte_process_info.bind_level)); if (ORTE_SUCCESS != rc) { return rc; } - rc = pmi_set_proc_attr ("BIND_IDX", &orte_process_info.bind_idx, sizeof (orte_process_info.bind_idx)); + rc = pmi_set_proc_attr (ORTE_DB_BIND_INDEX, &orte_process_info.bind_idx, sizeof (orte_process_info.bind_idx)); if (ORTE_SUCCESS != rc) { return rc; } #endif - /* get the job map for this job */ - jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, 0); - /* get my pidmap entry */ - pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid); - - rc = pmi_set_proc_attr ("LOCALRANK", &pmap->local_rank, sizeof (pmap->local_rank)); - if (ORTE_SUCCESS != rc) { - return rc; + /* fetch all of my connection info from the database and push it to PMI - includes + * my hostname, daemon vpid, local rank, and node rank + */ + OBJ_CONSTRUCT(&modex_data, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_db.fetch_multiple(ORTE_PROC_MY_NAME, NULL, &modex_data))) { + ORTE_ERROR_LOG(rc); + return rc; } - rc = pmi_set_proc_attr ("NODERANK", &pmap->node_rank, sizeof (pmap->node_rank)); - if (ORTE_SUCCESS != rc) { - return rc; + while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&modex_data))) { + switch (kv->type) { + case OPAL_STRING: + if (ORTE_SUCCESS != (rc = pmi_set_proc_attr(kv->key, kv->data.string, strlen(kv->data.string)))) { + ORTE_ERROR_LOG(rc); + return rc; + } + break; + case OPAL_INT: + if (ORTE_SUCCESS != (rc = pmi_set_proc_attr(kv->key, &kv->data.integer, sizeof(int)))) { + ORTE_ERROR_LOG(rc); + return rc; + } + break; + case ORTE_VPID: + case OPAL_UINT32: + if (ORTE_SUCCESS != (rc = pmi_set_proc_attr(kv->key, &kv->data.uint32, sizeof(uint32_t)))) { + ORTE_ERROR_LOG(rc); + return rc; + } + break; + case OPAL_UINT16: + if (ORTE_SUCCESS != (rc = pmi_set_proc_attr(kv->key, 
&kv->data.uint16, sizeof(uint16_t)))) { + ORTE_ERROR_LOG(rc); + return rc; + } + break; + case OPAL_BYTE_OBJECT: + if (ORTE_SUCCESS != (rc = pmi_set_proc_attr(kv->key, kv->data.bo.bytes, kv->data.bo.size))) { + ORTE_ERROR_LOG(rc); + return rc; + } + break; + default: + ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED); + return ORTE_ERR_NOT_SUPPORTED; + } + OBJ_RELEASE(kv); } + OBJ_DESTRUCT(&modex_data); rc = pmi_put_last_key (); if (ORTE_SUCCESS != rc) { @@ -431,7 +458,7 @@ static int modex(orte_grpcomm_collective_t *coll) name.vpid = v; - rc = pmi_get_proc_attr (name, "RMLURI", (void **) &rml_uri, &len); + rc = pmi_get_proc_attr (name, ORTE_DB_RMLURI, (void **) &rml_uri, &len); if (ORTE_SUCCESS != rc) { return rc; } @@ -447,7 +474,7 @@ static int modex(orte_grpcomm_collective_t *coll) } free(rml_uri); - rc = pmi_get_proc_attr (name, "HOSTNAME", &tmp_val, &len); + rc = pmi_get_proc_attr (name, ORTE_DB_HOSTNAME, (void**)&hostname, &len); if (ORTE_SUCCESS != rc) { return rc; } @@ -455,110 +482,151 @@ static int modex(orte_grpcomm_collective_t *coll) OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi: proc %s location %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&name), (char *) tmp_val)); + ORTE_NAME_PRINT(&name), hostname)); - /* see if this node is already in nidmap */ - for (i = 0, loc = NULL; i < orte_nidmap.size; i++) { - if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - continue; - } - if (0 == strcmp(tmp_val, nid->name)) { - /* found it */ - loc = nid; - free (tmp_val); - break; - } - } - if (NULL == loc) { - /* new node - save it */ - loc = OBJ_NEW(orte_nid_t); - loc->name = tmp_val; - loc->index = opal_pointer_array_add(&orte_nidmap, loc); - loc->daemon = loc->index; - /* keep track */ - orte_process_info.num_nodes++; - } - - /* see if this proc is already in the pidmap */ - if (NULL == (pmap = opal_pointer_array_get_item(&jmap->pmap, v))) { - /* nope - add it */ - pmap = OBJ_NEW(orte_pmap_t); - pmap->node 
= loc->index; - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, v, pmap))) { - ORTE_ERROR_LOG(rc); - return rc; - } + /* store it */ + if (ORTE_SUCCESS != (rc = orte_db.store(&name, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; } #if OPAL_HAVE_HWLOC { opal_hwloc_level_t bind_level; unsigned int bind_idx; + opal_hwloc_locality_t locality; /* get the proc's locality info, if available */ - pmi_get_proc_attr (name, "BIND_LEVEL", &tmp_val, &len); + pmi_get_proc_attr (name, ORTE_DB_BIND_LEVEL, &tmp_val, &len); if (ORTE_SUCCESS == rc && 0 < len) { assert (len == sizeof (bind_level)); memmove (&bind_level, tmp_val, len); free (tmp_val); } + if (ORTE_SUCCESS != (rc = orte_db.store(&name, ORTE_DB_BIND_LEVEL, &bind_level, OPAL_HWLOC_LEVEL_T))) { + ORTE_ERROR_LOG(rc); + return rc; + } - rc = pmi_get_proc_attr (name, "BIND_IDX", &tmp_val, &len); + rc = pmi_get_proc_attr (name, ORTE_DB_BIND_INDEX, &tmp_val, &len); if (ORTE_SUCCESS == rc && 0 < len) { assert (len == sizeof (bind_idx)); memmove (&bind_idx, tmp_val, len); free (tmp_val); } + if (ORTE_SUCCESS != (rc = orte_db.store(&name, ORTE_DB_BIND_INDEX, &bind_idx, OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + return rc; + } if (name.jobid == ORTE_PROC_MY_NAME->jobid && name.vpid == ORTE_PROC_MY_NAME->vpid) { /* if this data is from myself, then set locality to all */ - pmap->locality = OPAL_PROC_ALL_LOCAL; - } else if (loc->daemon != ORTE_PROC_MY_DAEMON->vpid) { + locality = OPAL_PROC_ALL_LOCAL; + } else if (0 != strcmp(hostname, orte_process_info.nodename)) { /* this is on a different node, then mark as non-local */ - pmap->locality = OPAL_PROC_NON_LOCAL; + locality = OPAL_PROC_NON_LOCAL; } else if (0 == len) { /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ - pmap->locality = OPAL_PROC_ON_NODE; + locality = OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ - pmap->locality = 
opal_hwloc_base_get_relative_locality(opal_hwloc_topology, - orte_process_info.bind_level, - orte_process_info.bind_idx, - bind_level, bind_idx); + locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, + orte_process_info.bind_level, + orte_process_info.bind_idx, + bind_level, bind_idx); } OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:pmi setting proc %s locale %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&name), - opal_hwloc_base_print_locality(pmap->locality))); + opal_hwloc_base_print_locality(locality))); + if (ORTE_SUCCESS != (rc = orte_db.store(&name, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) { + ORTE_ERROR_LOG(rc); + return rc; + } } #endif /* get the proc's local/node rank info */ - rc = pmi_get_proc_attr (name, "LOCALRANK", &tmp_val, &len); + rc = pmi_get_proc_attr (name, ORTE_DB_LOCALRANK, &tmp_val, &len); if (ORTE_SUCCESS != rc) { return rc; } - assert (len == sizeof (pmap->local_rank)); - memmove (&pmap->local_rank, tmp_val, len); + assert (len == sizeof (local_rank)); + memmove (&local_rank, tmp_val, len); free (tmp_val); + if (ORTE_SUCCESS != (rc = orte_db.store(&name, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) { + ORTE_ERROR_LOG(rc); + return rc; + } - rc = pmi_get_proc_attr (name, "NODERANK", &tmp_val, &len); + rc = pmi_get_proc_attr (name, ORTE_DB_NODERANK, &tmp_val, &len); if (ORTE_SUCCESS != rc) { return rc; } - assert (len == sizeof (pmap->node_rank)); - memmove (&pmap->node_rank, tmp_val, len); + assert (len == sizeof (node_rank)); + memmove (&node_rank, tmp_val, len); free (tmp_val); + if (ORTE_SUCCESS != (rc = orte_db.store(&name, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + return rc; + } OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output, "%s grpcomm:pmi: proc %s lrank %u nrank %u", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&name), - (unsigned int)pmap->local_rank, - (unsigned int)pmap->node_rank)); + (unsigned int)local_rank, + (unsigned 
int)node_rank)); + + /* have to get two other items that are for the MPI layer - these + * need to be stored in a particular way to match how they will + * be retrieved + */ + rc = pmi_get_proc_attr (name, "OMPI_ARCH", &tmp_val, &len); + if (ORTE_SUCCESS != rc) { + return rc; + } + assert (len == sizeof (uint32_t)); + memmove (&arch, tmp_val, len); + free (tmp_val); + if (ORTE_SUCCESS != (rc = orte_db.store(&name, "OMPI_ARCH", &arch, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = pmi_get_proc_attr (name, "MPI_THREAD_LEVEL", &tmp_val, &len); + if (ORTE_SUCCESS != rc) { + return rc; + } + assert (len == sizeof (uint8_t)); + memmove (&th_level, tmp_val, len); + free (tmp_val); + bo.bytes = &th_level; + bo.size = 1; + if (ORTE_SUCCESS != (rc = orte_db.store(&name, "MPI_THREAD_LEVEL", (void*)&bo, OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* harvest all btl info that we know about and store it */ + OBJ_CONSTRUCT(&modex_data, opal_list_t); + if (ORTE_SUCCESS != (rc = orte_db.fetch_multiple(ORTE_PROC_MY_NAME, "btl.*", &modex_data))) { + ORTE_ERROR_LOG(rc); + return rc; + } + while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&modex_data))) { + if (ORTE_SUCCESS != (rc = pmi_get_proc_attr(name, kv->key, &tmp_val, &len))) { + return rc; + } + if (ORTE_SUCCESS != (rc = orte_db.store(&name, kv->key, (void*)&(kv->data.bo), OPAL_BYTE_OBJECT))) { + ORTE_ERROR_LOG(rc); + return rc; + } + OBJ_RELEASE(kv); + } + OBJ_DESTRUCT(&modex_data); } OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, @@ -573,12 +641,6 @@ static int modex(orte_grpcomm_collective_t *coll) return rc; } -static int purge_proc_attrs(void) -{ - /* nothing to do here */ - return ORTE_SUCCESS; -} - static inline unsigned char pmi_base64_encsym (unsigned char value) { assert (value < 64); diff --git a/orte/mca/odls/base/base.h b/orte/mca/odls/base/base.h index 13d5000e6b..52ad8bf24b 100644 --- a/orte/mca/odls/base/base.h +++ b/orte/mca/odls/base/base.h @@ -76,9 +76,6 @@ 
ORTE_DECLSPEC int orte_odls_base_select(void); ORTE_DECLSPEC int orte_odls_base_finalize(void); ORTE_DECLSPEC int orte_odls_base_close(void); -/* setup singleton job data */ -ORTE_DECLSPEC void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid); - #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 1cc0d364da..7d8c474ae0 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -294,7 +294,11 @@ static int check_local_proc(orte_job_t *jdata, orte_proc_t *pptr) OPAL_OUTPUT_VERBOSE((20, orte_odls_globals.output, "%s odls:constructing child list - looking for daemon for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name))); - if (ORTE_VPID_INVALID == (host_daemon = orte_ess.proc_get_daemon(&pptr->name))) { + if (NULL == pptr->node || NULL == pptr->node->daemon) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (ORTE_VPID_INVALID == (host_daemon = pptr->node->daemon->name.vpid)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } @@ -535,7 +539,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data, } opal_dss.copy((void**)&jdata->pmap, bo, OPAL_BYTE_OBJECT); /* decode the pidmap - this will also free the bytes in bo */ - if (ORTE_SUCCESS != (rc = orte_ess.update_pidmap(bo))) { + if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_pidmap(bo))) { ORTE_ERROR_LOG(rc); goto REPORT_ERROR; } @@ -800,8 +804,6 @@ static int odls_base_default_setup_fork(orte_app_context_t *context, static int setup_child(orte_proc_t *child, orte_job_t *jobdat, char ***env) { char *param, *value; - orte_node_rank_t node_rank; - orte_local_rank_t local_rank; int rc; /* setup the jobid */ @@ -849,12 +851,12 @@ static int setup_child(orte_proc_t *child, orte_job_t *jobdat, char ***env) * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. 
* We know - just live with it */ - if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(&child->name))) { + if (ORTE_LOCAL_RANK_INVALID == child->local_rank) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; return rc; } - asprintf(&value, "%lu", (unsigned long) local_rank); + asprintf(&value, "%lu", (unsigned long) child->local_rank); opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, env); free(value); @@ -865,12 +867,12 @@ static int setup_child(orte_proc_t *child, orte_job_t *jobdat, char ***env) * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ - if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(&child->name))) { + if (ORTE_NODE_RANK_INVALID ==child->node_rank) { ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; return rc; } - asprintf(&value, "%lu", (unsigned long) node_rank); + asprintf(&value, "%lu", (unsigned long) child->node_rank); opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, env); /* set an mca param for it too */ if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) { @@ -1600,87 +1602,6 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i return ORTE_ERR_NOT_FOUND; } -void orte_odls_base_setup_singleton_jobdat(orte_jobid_t jobid) -{ - orte_job_t *jobdat; - orte_vpid_t vpid1; - int32_t one32; - orte_local_rank_t lrank; - orte_node_rank_t nrank; - orte_proc_state_t state; - orte_app_idx_t app_idx; - opal_buffer_t buffer; - opal_byte_object_t *bo; - int rc; -#if OPAL_HAVE_HWLOC - opal_hwloc_level_t bind_level; - unsigned int bind_idx; -#endif - - /* create a job tracking object for it */ - jobdat = OBJ_NEW(orte_job_t); - jobdat->jobid = jobid; - jobdat->num_procs = 1; - jobdat->num_local_procs = 1; - opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jobid), jobdat); - /* need to setup a pidmap for it */ - OBJ_CONSTRUCT(&buffer, 
opal_buffer_t); - opal_dss.pack(&buffer, &jobid, 1, ORTE_JOBID); /* jobid */ - vpid1 = 1; - opal_dss.pack(&buffer, &vpid1, 1, ORTE_VPID); /* num_procs */ -#if OPAL_HAVE_HWLOC - bind_level = OPAL_HWLOC_NODE_LEVEL; - opal_dss.pack(&buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* binding level */ -#endif - one32 = 0; - opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* node index */ - lrank = 0; - opal_dss.pack(&buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */ - nrank = 0; - opal_dss.pack(&buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */ -#if OPAL_HAVE_HWLOC - bind_idx = 0; - opal_dss.pack(&buffer, &bind_idx, 1, OPAL_UINT); /* bind index */ -#endif - state = ORTE_PROC_STATE_RUNNING; - opal_dss.pack(&buffer, &state, 1, ORTE_PROC_STATE); /* proc state */ - app_idx = 0; - opal_dss.pack(&buffer, &app_idx, 1, ORTE_APP_IDX); /* app index */ - one32 = 0; - opal_dss.pack(&buffer, &one32, 1, OPAL_INT32); /* restarts */ - /* setup a byte object and unload the packed data to it */ - bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); - opal_dss.unload(&buffer, (void**)&bo->bytes, &bo->size); - OBJ_DESTRUCT(&buffer); - /* save a copy to send back to the proc */ - opal_dss.copy((void**)&jobdat->pmap, bo, OPAL_BYTE_OBJECT); - /* update our ess data - this will release the byte object's data */ - if (ORTE_SUCCESS != (rc = orte_ess.update_pidmap(bo))) { - ORTE_ERROR_LOG(rc); - } - free(bo); - - /* if we don't yet have a daemon map, then we have to generate one - * to pass back to it - */ - if (NULL == orte_odls_globals.dmap) { - orte_odls_globals.dmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); - /* construct a nodemap */ - if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(orte_odls_globals.dmap))) { - ORTE_ERROR_LOG(rc); - } - /* we also need to update our local nidmap - copy the dmap - * as this will release the byte object's data. 
The copy function - * will automatically malloc the bo itself, so we don't need to do so here - */ - opal_dss.copy((void**)&bo, orte_odls_globals.dmap, OPAL_BYTE_OBJECT); - if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { - ORTE_ERROR_LOG(rc); - } - free(bo); - } -} - int orte_odls_base_default_require_sync(orte_process_name_t *proc, opal_buffer_t *buf, bool drop_nidmap) diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 0706586134..9710ecbc05 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -684,7 +684,7 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t */ orte_node_rank_t nrank; /* do I know my node_local_rank yet? */ - if (ORTE_NODE_RANK_INVALID != (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME)) && + if (ORTE_NODE_RANK_INVALID != (nrank = orte_process_info.my_node_rank) && (nrank+1) < opal_argv_count(mca_oob_tcp_component.tcp4_static_ports)) { /* any daemon takes the first entry, so we start with the second */ opal_argv_append_nosize(&ports, mca_oob_tcp_component.tcp4_static_ports[nrank+1]); @@ -1647,7 +1647,7 @@ int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer) * to compute the address and port */ if (orte_static_ports) { - if (NULL != (host = orte_ess.proc_get_hostname(&peer->peer_name))) { + if (NULL != (host = orte_get_proc_hostname(&peer->peer_name))) { /* lookup the address of this node */ if (NULL == (h = gethostbyname(host))) { /* this isn't an error - it just means we don't know @@ -1694,7 +1694,7 @@ int mca_oob_tcp_resolve(mca_oob_tcp_peer_t* peer) port = strtol(mca_oob_tcp_component.tcp4_static_ports[0], NULL, 10); } else { /* lookup the node rank of the proc */ - if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(&peer->peer_name)) || + if (ORTE_NODE_RANK_INVALID == (nrank = orte_get_proc_node_rank(&peer->peer_name)) || (nrank+1) > opal_argv_count(mca_oob_tcp_component.tcp4_static_ports)) { /* this isn't an error - it just means we don't know * 
how to compute a contact info for this proc diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 67752972ba..4bd24915ae 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -635,7 +635,7 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) mca_oob_tcp_msg_t *msg; char *host; - host = orte_ess.proc_get_hostname(&(peer->peer_name)); + host = orte_get_proc_hostname(&(peer->peer_name)); opal_output(0, "%s -> %s (node: %s) oob-tcp: Number of attempts to create TCP connection has been exceeded. Can not communicate with peer", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->peer_name)), diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 13178a94e8..b6517b4446 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -762,7 +762,7 @@ static int remote_spawn(opal_buffer_t *launch) goto cleanup; } /* update our nidmap - this will free data in the byte object */ - if (ORTE_SUCCESS != (rc = orte_ess.update_nidmap(bo))) { + if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(bo))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -817,7 +817,7 @@ static int remote_spawn(opal_buffer_t *launch) target.vpid = child->name.vpid; /* get the host where this daemon resides */ - if (NULL == (hostname = orte_ess.proc_get_hostname(&target))) { + if (NULL == (hostname = orte_get_proc_hostname(&target))) { opal_output(0, "%s unable to get hostname for daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->name.vpid)); rc = ORTE_ERR_NOT_FOUND; diff --git a/orte/mca/routed/base/routed_base_fns.c b/orte/mca/routed/base/routed_base_fns.c index 89c503dee3..232227112c 100644 --- a/orte/mca/routed/base/routed_base_fns.c +++ b/orte/mca/routed/base/routed_base_fns.c @@ -90,6 +90,8 @@ void orte_routed_base_coll_relay_routing(orte_grpcomm_collective_t *coll) opal_list_item_t *item, *itm; orte_namelist_t *nm, *n2, *n3; bool dup; + 
orte_job_t *jdata; + orte_proc_t *proc; if (ORTE_PROC_IS_HNP) { /* nobody to send to */ @@ -112,7 +114,13 @@ void orte_routed_base_coll_relay_routing(orte_grpcomm_collective_t *coll) if (ORTE_VPID_WILDCARD == n2->name.vpid) { nm->name.vpid = ORTE_PROC_MY_PARENT->vpid; } else { - nm->name.vpid = orte_ess.proc_get_daemon(&n2->name); + jdata = orte_get_job_data_object(n2->name.jobid); + proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n2->name.vpid); + if (NULL == proc || NULL == proc->node || NULL == proc->node->daemon) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + nm->name.vpid = proc->node->daemon->name.vpid; } /* if it is me, then ignore */ if (nm->name.vpid == ORTE_PROC_MY_NAME->vpid) { diff --git a/orte/mca/routed/binomial/routed_binomial.c b/orte/mca/routed/binomial/routed_binomial.c index e4165271eb..3e0244aca0 100644 --- a/orte/mca/routed/binomial/routed_binomial.c +++ b/orte/mca/routed/binomial/routed_binomial.c @@ -394,7 +394,7 @@ static orte_process_name_t get_route(orte_process_name_t *target) daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ - if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { + if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) { /*ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);*/ ret = ORTE_NAME_INVALID; goto found; @@ -814,7 +814,7 @@ static bool route_is_defined(const orte_process_name_t *target) } /* find out what daemon hosts this proc */ - if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) { + if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) { return false; } diff --git a/orte/mca/routed/debruijn/routed_debruijn.c b/orte/mca/routed/debruijn/routed_debruijn.c index be8f4702c4..d669e4861d 100644 --- a/orte/mca/routed/debruijn/routed_debruijn.c +++ b/orte/mca/routed/debruijn/routed_debruijn.c @@ -403,7 +403,7 @@ static orte_process_name_t get_route(orte_process_name_t 
*target) ret.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ - if (ORTE_VPID_INVALID == (ret.vpid = orte_ess.proc_get_daemon(target))) { + if (ORTE_VPID_INVALID == (ret.vpid = orte_get_proc_daemon_vpid(target))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ret = *ORTE_NAME_INVALID; break; @@ -793,7 +793,7 @@ static bool route_is_defined(const orte_process_name_t *target) } /* find out what daemon hosts this proc */ - if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) { + if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) { return false; } diff --git a/orte/mca/routed/radix/routed_radix.c b/orte/mca/routed/radix/routed_radix.c index 965279207c..0c944083a9 100644 --- a/orte/mca/routed/radix/routed_radix.c +++ b/orte/mca/routed/radix/routed_radix.c @@ -397,7 +397,7 @@ static orte_process_name_t get_route(orte_process_name_t *target) daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ - if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { + if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ret = ORTE_NAME_INVALID; goto found; @@ -816,7 +816,7 @@ static bool route_is_defined(const orte_process_name_t *target) } /* find out what daemon hosts this proc */ - if (ORTE_VPID_INVALID == orte_ess.proc_get_daemon((orte_process_name_t*)target)) { + if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) { return false; } diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index c0b639f8d0..3a6593f776 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -889,13 +889,13 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, proc.jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, proc.jobid); if (ORTE_PROC_IS_HNP) { return_addr = sender; + proc2.jobid = ORTE_PROC_MY_NAME->jobid; /* if the request is for a 
wildcard vpid, then it goes to every * daemon. For scalability, we should probably xcast this some * day - but for now, we just loop */ if (ORTE_VPID_WILDCARD == proc.vpid) { /* loop across all daemons */ - proc2.jobid = ORTE_PROC_MY_NAME->jobid; for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) { /* setup the cmd */ @@ -935,7 +935,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, /* this is for a single proc - see which daemon * this rank is on */ - if (ORTE_VPID_INVALID == (proc2.vpid = orte_ess.proc_get_daemon(&proc))) { + if (ORTE_VPID_INVALID == (proc2.vpid = orte_get_proc_daemon_vpid(&proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto SEND_TOP_ANSWER; } @@ -955,7 +955,6 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } - proc2.jobid = ORTE_PROC_MY_NAME->jobid; if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 6b8c3c36d8..55e3d6ac5f 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -484,20 +484,14 @@ int orte_daemon(int argc, char *argv[]) orte_node_t *node; orte_app_context_t *app; char *tmp, *nptr, *sysinfo; - int32_t ljob, one32; - orte_vpid_t vpid1; - orte_local_rank_t lrank; - orte_node_rank_t nrank; - opal_byte_object_t *bo; - orte_proc_state_t state; - orte_app_idx_t app_idx; + int32_t ljob; /* setup the singleton's job */ jdata = OBJ_NEW(orte_job_t); orte_plm_base_create_jobid(jdata); ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); - + /* must create a map for it (even though it has no * info in it) so that the job info will be picked * up in subsequent pidmaps or other daemons won't @@ -535,64 +529,40 @@ int orte_daemon(int argc, char *argv[]) proc->alive = true; proc->state = ORTE_PROC_STATE_RUNNING; proc->app_idx = 0; - /* obviously, they 
are on my node */ + /* obviously, it is on my node */ node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); proc->node = node; OBJ_RETAIN(node); /* keep accounting straight */ opal_pointer_array_add(jdata->procs, proc); jdata->num_procs = 1; - /* and obviously they are one of my local procs */ + /* and obviously it is one of my local procs */ OBJ_RETAIN(proc); opal_pointer_array_add(orte_local_children, proc); jdata->num_local_procs = 1; - + /* set the trivial */ + proc->local_rank = 0; + proc->node_rank = 0; + proc->app_rank = 0; + proc->state = ORTE_PROC_STATE_RUNNING; + proc->alive = true; + proc->app_idx = 0; + proc->local_proc = true; +#if OPAL_HAVE_HWLOC + proc->bind_idx = 0; +#endif + /* the singleton will use the first three collectives * for its modex/barriers */ orte_grpcomm_base.coll_id += 3; /* need to setup a pidmap for it */ - buffer = OBJ_NEW(opal_buffer_t); - opal_dss.pack(buffer, &jdata->jobid, 1, ORTE_JOBID); /* jobid */ - vpid1 = 1; - opal_dss.pack(buffer, &vpid1, 1, ORTE_VPID); /* num_procs */ -#if OPAL_HAVE_HWLOC - { - opal_hwloc_level_t bind_level; - bind_level = OPAL_HWLOC_NODE_LEVEL; - opal_dss.pack(buffer, &bind_level, 1, OPAL_HWLOC_LEVEL_T); /* num_procs */ - } -#endif - one32 = 0; - opal_dss.pack(buffer, &one32, 1, OPAL_INT32); /* node index */ - lrank = 0; - opal_dss.pack(buffer, &lrank, 1, ORTE_LOCAL_RANK); /* local rank */ - nrank = 0; - opal_dss.pack(buffer, &nrank, 1, ORTE_NODE_RANK); /* node rank */ -#if OPAL_HAVE_HWLOC - { - uint bind_idx; - bind_idx = 0; - opal_dss.pack(buffer, &bind_idx, 1, OPAL_UINT); /* bind index */ - } -#endif - state = ORTE_PROC_STATE_RUNNING; - opal_dss.pack(buffer, &state, 1, ORTE_PROC_STATE); /* proc state */ - app_idx = 0; - opal_dss.pack(buffer, &app_idx, 1, ORTE_APP_IDX); /* app index */ - one32 = 0; - opal_dss.pack(buffer, &one32, 1, OPAL_INT32); /* restarts */ - /* setup a byte object and unload the packed data to it */ - bo = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); - 
opal_dss.unload(buffer, (void**)&bo->bytes, &bo->size); - OBJ_RELEASE(buffer); - /* save a copy to send back to the proc */ - opal_dss.copy((void**)&jdata->pmap, bo, OPAL_BYTE_OBJECT); - /* update our ess data - this will release the byte object's data */ - if (ORTE_SUCCESS != (ret = orte_ess.update_pidmap(bo))) { + jdata->pmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t)); + if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(jdata->pmap))) { ORTE_ERROR_LOG(ret); + goto DONE; } - free(bo); + /* if we don't yet have a daemon map, then we have to generate one * to pass back to it @@ -602,14 +572,7 @@ int orte_daemon(int argc, char *argv[]) /* construct a nodemap */ if (ORTE_SUCCESS != (ret = orte_util_encode_nodemap(orte_odls_globals.dmap))) { ORTE_ERROR_LOG(ret); - } - /* we also need to update our local nidmap - copy the dmap - * as this will release the byte object's data. The copy function - * will automatically malloc the bo itself, so we don't need to do so here - */ - opal_dss.copy((void**)&bo, orte_odls_globals.dmap, OPAL_BYTE_OBJECT); - if (ORTE_SUCCESS != (ret = orte_ess.update_nidmap(bo))) { - ORTE_ERROR_LOG(ret); + goto DONE; } } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 42d7c38321..27c5da6369 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -37,6 +37,7 @@ #include "opal/dss/dss.h" #include "opal/threads/threads.h" +#include "orte/mca/db/db.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/rml.h" #include "orte/util/proc_info.h" @@ -126,10 +127,6 @@ opal_pointer_array_t *orte_node_topologies; opal_pointer_array_t *orte_local_children; uint16_t orte_num_jobs = 0; -/* Nidmap and job maps */ -opal_pointer_array_t orte_nidmap; -opal_pointer_array_t orte_jobmap; - /* IOF controls */ bool orte_tag_output; bool orte_timestamp_output; @@ -450,6 +447,88 @@ orte_job_t* orte_get_job_data_object(orte_jobid_t job) return (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 
ljob); } +orte_proc_t* orte_get_proc_object(orte_process_name_t *proc) +{ + orte_job_t *jdata; + orte_proc_t *proct; + + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { + return NULL; + } + proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); + return proct; +} + +orte_vpid_t orte_get_proc_daemon_vpid(orte_process_name_t *proc) +{ + orte_job_t *jdata; + orte_proc_t *proct; + + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { + return ORTE_VPID_INVALID; + } + if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { + return ORTE_VPID_INVALID; + } + if (NULL == proct->node || NULL == proct->node->daemon) { + return ORTE_VPID_INVALID; + } + return proct->node->daemon->name.vpid; +} + +char* orte_get_proc_hostname(orte_process_name_t *proc) +{ + orte_proc_t *proct; + char *hostname; + int rc; + + if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { + /* look it up on our arrays */ + if (NULL == (proct = orte_get_proc_object(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return NULL; + } + if (NULL == proct->node || NULL == proct->node->name) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return NULL; + } + return proct->node->name; + } + + /* if we are an app, get the pointer from the modex db */ + if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(proc, ORTE_DB_HOSTNAME, + (void**)&hostname, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return NULL; + } + return hostname; +} + +orte_node_rank_t orte_get_proc_node_rank(orte_process_name_t *proc) +{ + orte_proc_t *proct; + orte_node_rank_t noderank, *nr; + int rc; + + if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { + /* look it up on our arrays */ + if (NULL == (proct = orte_get_proc_object(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_NODE_RANK_INVALID; + } + return proct->node_rank; + } + + /* if we are an app, get the value from the modex db */ + nr = &noderank; + if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(proc, 
ORTE_DB_NODERANK, + (void**)&nr, ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + return ORTE_NODE_RANK_INVALID; + } + return noderank; +} + orte_vpid_t orte_get_lowest_vpid_alive(orte_jobid_t job) { int i; @@ -905,75 +984,6 @@ OBJ_CLASS_INSTANCE(orte_proc_t, orte_proc_construct, orte_proc_destruct); -static void orte_nid_construct(orte_nid_t *ptr) -{ - ptr->name = NULL; - ptr->daemon = ORTE_VPID_INVALID; - ptr->oversubscribed = false; -} - -static void orte_nid_destruct(orte_nid_t *ptr) -{ - if (NULL != ptr->name) { - free(ptr->name); - ptr->name = NULL; - } -} - -OBJ_CLASS_INSTANCE(orte_nid_t, - opal_object_t, - orte_nid_construct, - orte_nid_destruct); - -static void orte_pmap_construct(orte_pmap_t *ptr) -{ - ptr->node = -1; - ptr->local_rank = ORTE_LOCAL_RANK_INVALID; - ptr->node_rank = ORTE_NODE_RANK_INVALID; -#if OPAL_HAVE_HWLOC - ptr->bind_idx = 0; - ptr->locality = OPAL_PROC_LOCALITY_UNKNOWN; -#endif -} -OBJ_CLASS_INSTANCE(orte_pmap_t, - opal_object_t, - orte_pmap_construct, - NULL); - - -static void orte_jmap_construct(orte_jmap_t *ptr) -{ - ptr->job = ORTE_JOBID_INVALID; - ptr->num_procs = 0; -#if OPAL_HAVE_HWLOC - ptr->bind_level = OPAL_HWLOC_NODE_LEVEL; -#endif - OBJ_CONSTRUCT(&ptr->pmap, opal_pointer_array_t); - opal_pointer_array_init(&ptr->pmap, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE); -} - -static void orte_jmap_destruct(orte_jmap_t *ptr) -{ - orte_pmap_t *pmap; - int i; - - for (i=0; i < ptr->pmap.size; i++) { - if (NULL != (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&ptr->pmap, i))) { - OBJ_RELEASE(pmap); - } - } - OBJ_DESTRUCT(&ptr->pmap); -} - -OBJ_CLASS_INSTANCE(orte_jmap_t, - opal_object_t, - orte_jmap_construct, - orte_jmap_destruct); - - static void orte_job_map_construct(orte_job_map_t* map) { map->req_mapper = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index e594e86b6a..232ca57a87 100644 --- a/orte/runtime/orte_globals.h +++ 
b/orte/runtime/orte_globals.h @@ -532,56 +532,8 @@ struct orte_proc_t { typedef struct orte_proc_t orte_proc_t; ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_proc_t); -typedef struct { - /* base object */ - opal_object_t super; - /* index in the array */ - int index; - /* nodename */ - char *name; - /* vpid of this job family's daemon on this node */ - orte_vpid_t daemon; - /* whether or not this node is oversubscribed */ - bool oversubscribed; -} orte_nid_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_nid_t); - -typedef struct { - /* base object */ - opal_object_t super; - /* index to node */ - int32_t node; - /* local rank */ - orte_local_rank_t local_rank; - /* node rank */ - orte_node_rank_t node_rank; -#if OPAL_HAVE_HWLOC - /* bind index */ - unsigned int bind_idx; - /* locality */ - opal_hwloc_locality_t locality; -#endif -} orte_pmap_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_pmap_t); - -typedef struct { - /* base object */ - opal_object_t super; - /* jobid */ - orte_jobid_t job; - /* number of procs in this job */ - orte_vpid_t num_procs; -#if OPAL_HAVE_HWLOC - /* binding level of the job */ - opal_hwloc_level_t bind_level; -#endif - /* array of data for procs */ - opal_pointer_array_t pmap; -} orte_jmap_t; -ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_jmap_t); - /** -* Get a job data object + * Get a job data object * We cannot just reference a job data object with its jobid as * the jobid is no longer an index into the array. 
This change * was necessitated by modification of the jobid to include @@ -590,6 +542,22 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_jmap_t); */ ORTE_DECLSPEC orte_job_t* orte_get_job_data_object(orte_jobid_t job); +/** + * Get a proc data object + */ +ORTE_DECLSPEC orte_proc_t* orte_get_proc_object(orte_process_name_t *proc); + +/** + * Get the daemon vpid hosting a given proc + */ +ORTE_DECLSPEC orte_vpid_t orte_get_proc_daemon_vpid(orte_process_name_t *proc); + +/* Get the hostname of a proc */ +ORTE_DECLSPEC char* orte_get_proc_hostname(orte_process_name_t *proc); + +/* get the node rank of a proc */ +ORTE_DECLSPEC orte_node_rank_t orte_get_proc_node_rank(orte_process_name_t *proc); + /* Find the lowest vpid alive in a given job */ ORTE_DECLSPEC orte_vpid_t orte_get_lowest_vpid_alive(orte_jobid_t job); @@ -661,10 +629,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies; ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children; ORTE_DECLSPEC extern uint16_t orte_num_jobs; -/* Nidmap and job maps */ -ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap; -ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap; - /* whether or not to forward SIGTSTP and SIGCONT signals */ ORTE_DECLSPEC extern bool orte_forward_job_control; diff --git a/orte/util/name_fns.h b/orte/util/name_fns.h index ad3acd90cd..9c859fdff9 100644 --- a/orte/util/name_fns.h +++ b/orte/util/name_fns.h @@ -99,6 +99,10 @@ ORTE_DECLSPEC char *orte_pretty_print_timing(int64_t secs, int64_t usecs); #define ORTE_JOBID_IS_DAEMON(n) \ !((n) & 0x0000ffff) +/* a macro for obtaining the daemon jobid */ +#define ORTE_DAEMON_JOBID(n) \ + ((n) & 0xffff0000) + /* List of names for general use */ struct orte_namelist_t { opal_list_item_t super; /**< Allows this item to be placed on a list */ diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 173877f643..bf83e3edad 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -52,7 +52,9 @@ #include "opal/mca/hwloc/base/base.h" #include 
"opal/util/output.h" #include "opal/util/argv.h" +#include "opal/datatype/opal_datatype.h" +#include "orte/mca/db/db.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/odls/base/odls_private.h" #include "orte/util/show_help.h" @@ -65,28 +67,12 @@ #include "orte/util/nidmap.h" -static bool initialized = false; - int orte_util_nidmap_init(opal_buffer_t *buffer) { int32_t cnt; int rc; opal_byte_object_t *bo; - if (!initialized) { - /* need to construct the global arrays */ - /* setup the nidmap array */ - OBJ_CONSTRUCT(&orte_nidmap, opal_pointer_array_t); - opal_pointer_array_init(&orte_nidmap, 8, INT32_MAX, 8); - - /* setup array of jmaps */ - OBJ_CONSTRUCT(&orte_jobmap, opal_pointer_array_t); - opal_pointer_array_init(&orte_jobmap, 1, INT32_MAX, 1); - - /* make sure we don't do this twice */ - initialized = true; - } - /* it is okay if the buffer is empty */ if (NULL == buffer || 0 == buffer->bytes_used) { return ORTE_SUCCESS; @@ -141,31 +127,6 @@ int orte_util_nidmap_init(opal_buffer_t *buffer) void orte_util_nidmap_finalize(void) { - orte_nid_t *nid; - orte_jmap_t *jmap; - int32_t i; - - if (!initialized) { - /* nothing to do */ - return; - } - - /* deconstruct the global nidmap and jobmap arrays */ - for (i=0; i < orte_nidmap.size; i++) { - if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - continue; - } - OBJ_RELEASE(nid); - } - OBJ_DESTRUCT(&orte_nidmap); - for (i=0; i < orte_jobmap.size; i++) { - if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) { - continue; - } - OBJ_RELEASE(jmap); - } - OBJ_DESTRUCT(&orte_jobmap); - #if OPAL_HAVE_HWLOC /* destroy the topology */ if (NULL != opal_hwloc_topology) { @@ -173,42 +134,11 @@ void orte_util_nidmap_finalize(void) opal_hwloc_topology = NULL; } #endif - - /* flag that these are no longer initialized */ - initialized = false; -} - -int orte_util_setup_local_nidmap_entries(void) -{ - orte_nid_t *node; - orte_jmap_t *jmap; - orte_pmap_t *pmap; - - /* 
add a jmap entry for myself */ - jmap = OBJ_NEW(orte_jmap_t); - jmap->job = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_add(&orte_jobmap, jmap); - jmap->num_procs = 1; - - /* create a nidmap entry for this node */ - node = OBJ_NEW(orte_nid_t); - node->name = strdup(orte_process_info.nodename); - node->daemon = ORTE_PROC_MY_DAEMON->vpid; - pmap = OBJ_NEW(orte_pmap_t); - pmap->local_rank = 0; - pmap->node_rank = 0; - node->index = opal_pointer_array_add(&orte_nidmap, node); - pmap->node = node->index; - opal_pointer_array_set_item(&jmap->pmap, ORTE_PROC_MY_NAME->vpid, pmap); - - /* all done */ - return ORTE_SUCCESS; } #if ORTE_ENABLE_STATIC_PORTS int orte_util_build_daemon_nidmap(char **nodes) { - orte_nid_t *node; int i, num_nodes; int rc; struct hostent *h; @@ -228,38 +158,42 @@ int orte_util_build_daemon_nidmap(char **nodes) return ORTE_SUCCESS; } - /* set the size of the nidmap storage so we minimize realloc's */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes+1))) { + /* install the entry for the HNP */ + proc.jobid = ORTE_PROC_MY_NAME->jobid; + proc.vpid = 0; + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &proc.vpid, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return rc; } - - /* install the entry for the HNP */ - node = OBJ_NEW(orte_nid_t); - node->name = strdup("HNP"); - node->daemon = 0; - /* the arch defaults to our arch so that non-hetero - * case will yield correct behavior - */ - opal_pointer_array_set_item(&orte_nidmap, 0, node); - + addr = "HNP"; + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, addr, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* the daemon vpids will be assigned in order, * starting with vpid=1 for the first node in * the list */ OBJ_CONSTRUCT(&buf, opal_buffer_t); - proc.jobid = ORTE_PROC_MY_NAME->jobid; for (i=0; i < num_nodes; i++) { - node = OBJ_NEW(orte_nid_t); - node->name = strdup(nodes[i]); - node->daemon = i+1; + /* define the vpid for this 
daemon */ + proc.vpid = i+1; + /* store the hostname for the proc */ + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, nodes[i], OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* the arch defaults to our arch so that non-hetero * case will yield correct behavior */ - opal_pointer_array_set_item(&orte_nidmap, node->daemon, node); + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_ARCH, &opal_local_arch, OPAL_UINT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* lookup the address of this node */ - if (NULL == (h = gethostbyname(node->name))) { + if (NULL == (h = gethostbyname(nodes[i]))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } @@ -271,15 +205,14 @@ int orte_util_build_daemon_nidmap(char **nodes) * define the info necessary for opening such a port if/when I communicate * to them */ - /* construct the URI */ - proc.vpid = node->daemon; + /* construct the URI */ orte_util_convert_process_name_to_string(&proc_name, &proc); asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port); OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s orte:util:build:daemon:nidmap node %s daemon %d addr %s uri %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, (int)node->daemon, addr, uri)); + nodes[i], i+1, addr, uri)); opal_dss.pack(&buf, &uri, 1, OPAL_STRING); free(proc_name); free(uri); @@ -297,14 +230,12 @@ int orte_util_build_daemon_nidmap(char **nodes) int orte_util_encode_nodemap(opal_byte_object_t *boptr) { - orte_vpid_t *vpids; - orte_node_t *node, *hnp; + orte_vpid_t vpid; + orte_node_t *node; int32_t i, num_nodes; int rc; - char *nodename; opal_buffer_t buf; - char *ptr; - uint8_t *oversub=NULL; + char *ptr, *nodename; /* setup a buffer for tmp use */ OBJ_CONSTRUCT(&buf, opal_buffer_t); @@ -324,16 +255,22 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr) return rc; } - /* the HNP always has an entry at posn 0 - get its pointer as - * we will need it later - */ - hnp = 
(orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - - /* pack every nodename individually */ + /* pack the data for each node by daemon */ for (i=0; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } + if (NULL == node->daemon) { + /* some nodes may not have daemons on them */ + vpid = ORTE_VPID_INVALID; + } else { + vpid = node->daemon->name.vpid; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &vpid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the name of the node */ if (!orte_keep_fqdn_hostnames) { nodename = strdup(node->name); if (NULL != (ptr = strchr(nodename, '.'))) { @@ -350,45 +287,13 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr) return rc; } } + /* pack the oversubscribed flag */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node->oversubscribed, 1, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } } - /* since the daemon vpids may not correspond to the node - * index, we need to also pack the vpid array for all - * daemons. This scenario can happen when the user is - * employing a mapping algo that doesn't use all allocated - * nodes, and sprinkles procs across them in some non-contig - * manner. 
For example, use of the seq mapper where only - * some nodes are used, and where the usage leaves "holes" - * in the node array, will cause the daemon vpids to not - * match their node array index - */ - - /* allocate space for the daemon vpids and oversubscribed flags */ - vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t)); - oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t)); - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - if (NULL == node->daemon) { - /* some nodes may not have daemons on them */ - vpids[i] = ORTE_VPID_INVALID; - continue; - } - vpids[i] = node->daemon->name.vpid; - oversub[i] = node->oversubscribed; - } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, vpids, num_nodes, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - free(vpids); - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, oversub, num_nodes, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - free(oversub); - /* transfer the payload to the byte object */ opal_dss.unload(&buf, (void**)&boptr->bytes, &boptr->size); OBJ_DESTRUCT(&buf); @@ -396,36 +301,21 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr) return ORTE_SUCCESS; } +/* decode a nodemap for an application process */ int orte_util_decode_nodemap(opal_byte_object_t *bo) { int n; int32_t num_nodes, i, num_daemons; - orte_nid_t *node; - orte_vpid_t *vpids; - orte_nid_t *nd, *ndptr; + orte_process_name_t daemon; opal_buffer_t buf; int rc; - uint8_t *oversub; + uint8_t oversub; + char *nodename; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s decode:nidmap decoding nodemap", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - /* if there are any entries already in the node array, clear it out */ - if (0 < orte_nidmap.size) { - /* unfortunately, the opal function "remove_all" doesn't release - * the memory pointed to by the elements in the array, so we need - * to release those first - */ - for (i=0; i < 
orte_nidmap.size; i++) { - if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - OBJ_RELEASE(ndptr); - } - } - /* now use the opal function to reset the internal pointers */ - opal_pointer_array_remove_all(&orte_nidmap); - } - /* xfer the byte object to a buffer for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.load(&buf, bo->bytes, bo->size); @@ -441,96 +331,62 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo) "%s decode:nidmap decoding %d nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes)); - /* set the size of the nidmap storage so we minimize realloc's */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&orte_nidmap, num_nodes))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* loop over nodes and unpack the raw nodename */ + /* set the daemon jobid */ + daemon.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid); + + num_daemons = 0; for (i=0; i < num_nodes; i++) { - node = OBJ_NEW(orte_nid_t); - /* the arch defaults to our arch so that non-hetero - * case will yield correct behavior - */ - opal_pointer_array_set_item(&orte_nidmap, i, node); - - /* unpack the node's name */ + /* unpack the daemon vpid */ n=1; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &daemon.vpid, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_VPID_INVALID != daemon.vpid) { + ++num_daemons; + } + /* unpack and store the node's name */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &nodename, &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = orte_db.store(&daemon, ORTE_DB_HOSTNAME, nodename, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* if this is my daemon, then store the data for me too */ + if (daemon.vpid == ORTE_PROC_MY_DAEMON->vpid) { + if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_HOSTNAME, nodename, OPAL_STRING))) { + 
ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_DAEMON_VPID, &daemon.vpid, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* unpack and discard the oversubscribed flag - procs don't need it */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); return rc; } } - /* unpack the daemon vpids */ - vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t)); - n=num_nodes; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, vpids, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* unpack the oversubscribed flags */ - oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t)); - n=num_nodes; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* transfer the data to the nidmap, counting the number of - * daemons in the system - */ - num_daemons = 0; - for (i=0; i < num_nodes; i++) { - if (NULL != (ndptr = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - ndptr->daemon = vpids[i]; - if (0 == oversub[i]) { - ndptr->oversubscribed = false; - } else { - ndptr->oversubscribed = true; - } - if (ORTE_VPID_INVALID != vpids[i]) { - ++num_daemons; - } - } - } - free(vpids); - free(oversub); - - /* if we are a daemon or the HNP, update our num_procs */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { - orte_process_info.num_procs = num_daemons; - - if (orte_process_info.max_procs < orte_process_info.num_procs) { - orte_process_info.max_procs = orte_process_info.num_procs; - } - } /* update num_daemons */ orte_process_info.num_daemons = num_daemons; - if (0 < opal_output_get_verbosity(orte_debug_output)) { - for (i=0; i < num_nodes; i++) { - if (NULL == (nd = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - continue; - } - opal_output(5, "%s node[%d].name %s daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, - (NULL == nd->name) ? 
"NULL" : nd->name, - ORTE_VPID_PRINT(nd->daemon)); - } - } - OBJ_DESTRUCT(&buf); return ORTE_SUCCESS; } +/* decode a nodemap for a daemon */ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) { int n; int32_t num_nodes, i; - orte_vpid_t *vpids; + orte_vpid_t vpid; orte_node_t *node; opal_buffer_t buf; int rc; @@ -558,15 +414,24 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) "%s decode:nidmap decoding %d nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_nodes)); - /* set the size of the nidmap storage so we minimize realloc's */ + /* set the size of the node pool storage so we minimize realloc's */ if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes))) { ORTE_ERROR_LOG(rc); return rc; } - /* loop over nodes and unpack the raw nodename */ + /* transfer the data to the nodes, counting the number of + * daemons in the system + */ + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); for (i=0; i < num_nodes; i++) { - /* unpack the node's name */ + /* unpack the daemon vpid */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &vpid, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack and store the node's name */ n=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &name, &n, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -583,44 +448,21 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) } else { free(name); } - } - - /* unpack the daemon vpids */ - vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t)); - n=num_nodes; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, vpids, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* unpack the oversubscribed flags */ - oversub = (uint8_t*)malloc(num_nodes * sizeof(uint8_t)); - n=num_nodes; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, oversub, &n, OPAL_UINT8))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - /* transfer the data to the nodes, counting the number of - * daemons in the system - */ - daemons = 
orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - for (i=0; i < num_nodes; i++) { - if (ORTE_VPID_INVALID == vpids[i]) { + /* unpack the oversubscribed flag */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &oversub, &n, OPAL_UINT8))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_VPID_INVALID == vpid) { /* no daemon on this node */ continue; } - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - /* this is an error */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); - return ORTE_ERR_NOT_FOUND; - } - if (NULL == (dptr = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, vpids[i]))) { + if (NULL == (dptr = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, vpid))) { dptr = OBJ_NEW(orte_proc_t); dptr->name.jobid = ORTE_PROC_MY_NAME->jobid; - dptr->name.vpid = vpids[i]; - opal_pointer_array_set_item(daemons->procs, vpids[i], dptr); + dptr->name.vpid = vpid; + opal_pointer_array_set_item(daemons->procs, vpid, dptr); daemons->num_procs++; } if (NULL != node->daemon) { @@ -633,15 +475,13 @@ int orte_util_decode_daemon_nodemap(opal_byte_object_t *bo) } OBJ_RETAIN(node); dptr->node = node; - if (0 == oversub[i]) { + if (0 == oversub) { node->oversubscribed = false; } else { node->oversubscribed = true; } } - free(vpids); - free(oversub); - + orte_process_info.num_procs = daemons->num_procs; if (orte_process_info.max_procs < orte_process_info.num_procs) { @@ -674,7 +514,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) orte_local_rank_t *lrank = NULL; orte_node_rank_t *nrank = NULL; orte_job_t *jdata = NULL; - int32_t *nodes = NULL; + orte_vpid_t *daemons = NULL; int i, j, k, rc = ORTE_SUCCESS; #if OPAL_HAVE_HWLOC unsigned int *bind_idx=NULL; @@ -719,7 +559,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) #endif /* allocate memory for the nodes, local ranks, node ranks, and bind_idx */ - nodes = (int32_t*)malloc(jdata->num_procs * sizeof(int32_t)); 
+ daemons = (orte_vpid_t*)malloc(jdata->num_procs * sizeof(orte_vpid_t)); lrank = (orte_local_rank_t*)malloc(jdata->num_procs*sizeof(orte_local_rank_t)); nrank = (orte_node_rank_t*)malloc(jdata->num_procs*sizeof(orte_node_rank_t)); states = (orte_proc_state_t*)malloc(jdata->num_procs*sizeof(orte_proc_state_t)); @@ -738,7 +578,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) true, jdata->num_procs); break; } - nodes[k] = proc->node->index; + daemons[k] = proc->node->daemon->name.vpid; lrank[k] = proc->local_rank; nrank[k] = proc->node_rank; states[k] = proc->state; @@ -749,7 +589,7 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) #endif ++k; } - if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, nodes, jdata->num_procs, OPAL_INT32))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, daemons, jdata->num_procs, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup_and_return; } @@ -798,8 +638,8 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) if( NULL != nrank ) { free(nrank); } - if( NULL != nodes ) { - free(nodes); + if( NULL != daemons ) { + free(daemons); } #if OPAL_HAVE_HWLOC if( NULL != bind_idx ) { @@ -823,26 +663,26 @@ int orte_util_encode_pidmap(opal_byte_object_t *boptr) /* only APPS call this function - daemons have their own */ int orte_util_decode_pidmap(opal_byte_object_t *bo) { - orte_jobid_t jobid; - orte_vpid_t i, num_procs; - orte_pmap_t *pmap; - int32_t *nodes=NULL; + orte_vpid_t i, num_procs, *vptr, daemon; + orte_vpid_t *daemons=NULL; orte_local_rank_t *local_rank=NULL; orte_node_rank_t *node_rank=NULL; #if OPAL_HAVE_HWLOC - opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL; - unsigned int *bind_idx=NULL; + opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr; + unsigned int *bind_idx=NULL, pbidx, *uiptr; + opal_hwloc_locality_t locality; #endif orte_std_cntr_t n; opal_buffer_t buf; - orte_jmap_t *jmap; - bool already_present; - int j, k; int rc; orte_proc_state_t *states = NULL; orte_app_idx_t *app_idx = 
NULL; int32_t *restarts = NULL; - + orte_process_name_t proc, dmn; + orte_namelist_t *nm; + opal_list_t jobs; + char *hostname; + /* xfer the byte object to a buffer for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) { @@ -852,59 +692,52 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo) n = 1; /* cycle through the buffer */ - while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &jobid, &n, ORTE_JOBID))) { - /* unfortunately, job objects cannot be stored - * by index number as the jobid is a constructed - * value. So we have no choice but to cycle through - * the jobmap pointer array and look for this entry. Since - * jobs are cleaned up as they complete, check the - * entire array - */ + OBJ_CONSTRUCT(&jobs, opal_list_t); + while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &proc.jobid, &n, ORTE_JOBID))) { + /* record the jobid */ + nm = OBJ_NEW(orte_namelist_t); + nm->name.jobid = proc.jobid; + opal_list_append(&jobs, &nm->super); - jmap = NULL; - already_present = false; - for (j=0; j < orte_jobmap.size; j++) { - if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, j))) { - continue; - } - if (jobid == jmap->job) { - already_present = true; - break; - } - } - - /* unpack the number of procs */ + /* unpack and store the number of procs */ n=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } + proc.vpid = ORTE_VPID_INVALID; + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NPROCS, &num_procs, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } #if OPAL_HAVE_HWLOC - /* unpack the binding level */ + /* unpack and store the binding level */ n=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } + /* store it */ + proc.vpid = ORTE_VPID_INVALID; + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_LEVEL, 
&bind_level, OPAL_HWLOC_LEVEL_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } /* set mine */ - if (jobid == ORTE_PROC_MY_NAME->jobid) { + if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { orte_process_info.bind_level = bind_level; } #endif - /* allocate memory for the node info */ - nodes = (int32_t*)malloc(num_procs * 4); + /* allocate memory for the daemon info */ + daemons = (orte_vpid_t*)malloc(num_procs * sizeof(orte_vpid_t)); /* unpack it in one shot */ n=num_procs; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, nodes, &n, OPAL_INT32))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, daemons, &n, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } - if (jobid == ORTE_PROC_MY_NAME->jobid) { - /* track my node */ - orte_process_info.my_node = nodes[ORTE_PROC_MY_NAME->vpid]; - } /* allocate memory for local ranks */ local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t)); @@ -914,6 +747,15 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo) ORTE_ERROR_LOG(rc); goto cleanup; } + if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { + /* set mine */ + orte_process_info.my_local_rank = local_rank[ORTE_PROC_MY_NAME->vpid]; + if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK, + &orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } /* allocate memory for node ranks */ node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t)); @@ -923,6 +765,15 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo) ORTE_ERROR_LOG(rc); goto cleanup; } + if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { + /* set mine */ + orte_process_info.my_node_rank = node_rank[ORTE_PROC_MY_NAME->vpid]; + if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK, + &orte_process_info.my_node_rank, ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + } #if OPAL_HAVE_HWLOC /* allocate memory for bind_idx */ @@ -933,9 +784,14 @@ int 
orte_util_decode_pidmap(opal_byte_object_t *bo) ORTE_ERROR_LOG(rc); goto cleanup; } - if (jobid == ORTE_PROC_MY_NAME->jobid) { + if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { /* set mine */ orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid]; + if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX, + &orte_process_info.bind_idx, OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } } #endif @@ -975,68 +831,48 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo) free(restarts); restarts = NULL; - /* if we already know about this job, we need to check the data to see - * if something has changed - e.g., a proc that is being restarted somewhere - * other than where it previously was - */ - if (already_present) { - /* we already have the jmap object, so let's refresh its pidmap - * using the new data - start by cleaning out the old array - */ - for (j=0; j < jmap->pmap.size; j++) { - if (NULL == (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, j))) { - continue; - } - OBJ_RELEASE(pmap); - } - /* now use the opal function to reset the internal pointers */ - opal_pointer_array_remove_all(&jmap->pmap); - } else { - /* if we don't already have this data, store it - * unfortunately, job objects cannot be stored - * by index number as the jobid is a constructed - * value. 
So we have to just add it to the end - * of the array - */ - jmap = OBJ_NEW(orte_jmap_t); - jmap->job = jobid; - if (0 > (j = opal_pointer_array_add(&orte_jobmap, jmap))) { - ORTE_ERROR_LOG(j); - rc = j; - goto cleanup; - } - } - /* update the binding level and num_procs */ -#if OPAL_HAVE_HWLOC - jmap->bind_level = bind_level; -#endif - jmap->num_procs = num_procs; - /* set the size of the storage so we minimize realloc's */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(&jmap->pmap, num_procs))) { - ORTE_ERROR_LOG(rc); - return rc; - } - + /* set the daemon jobid */ + dmn.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid); + /* xfer the data */ for (i=0; i < num_procs; i++) { - pmap = OBJ_NEW(orte_pmap_t); - pmap->node = nodes[i]; - pmap->local_rank = local_rank[i]; - pmap->node_rank = node_rank[i]; -#if OPAL_HAVE_HWLOC - pmap->bind_idx = bind_idx[i]; -#endif - /* add the pidmap entry at the specific site corresponding - * to the proc's vpid - */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(&jmap->pmap, i, pmap))) { + if (proc.jobid == ORTE_PROC_MY_NAME->jobid && + i == ORTE_PROC_MY_NAME->vpid) { + continue; + } + proc.vpid = i; + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &daemons[i], ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } + /* lookup and store the hostname for this proc */ + dmn.vpid = daemons[i]; + if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank[i], ORTE_LOCAL_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank[i], ORTE_NODE_RANK))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } +#if OPAL_HAVE_HWLOC + if (ORTE_SUCCESS 
!= (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx[i], OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } +#endif } /* release data */ - free(nodes); - nodes = NULL; + free(daemons); + daemons = NULL; free(local_rank); local_rank = NULL; free(node_rank); @@ -1058,39 +894,71 @@ int orte_util_decode_pidmap(opal_byte_object_t *bo) * to know our own node, so go back and record the * locality of each proc relative to me */ - for (j=0; j < orte_jobmap.size; j++) { - if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, j))) { - continue; + while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&jobs))) { + proc.jobid = nm->name.jobid; + /* recover the number of procs in this job */ + vptr = &num_procs; + proc.vpid = ORTE_VPID_INVALID; + if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_NPROCS, (void**)&vptr, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; } - for (k=0; k < jmap->pmap.size; k++) { - if (NULL == (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, k))) { + + for (i=0; i < num_procs; i++) { + if (ORTE_PROC_MY_NAME->vpid == i && + ORTE_PROC_MY_NAME->jobid == proc.jobid) { + /* this is me */ continue; } - if (ORTE_PROC_MY_NAME->vpid == (orte_vpid_t)k && - jmap->job == ORTE_PROC_MY_NAME->jobid) { - /* this is me */ - pmap->locality = OPAL_PROC_ALL_LOCAL; - } else if (pmap->node == orte_process_info.my_node) { + proc.vpid = i; + /* recover the daemon for this proc */ + vptr = &daemon; + if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_DAEMON_VPID, (void**)&vptr, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + if (daemon == ORTE_PROC_MY_DAEMON->vpid) { #if OPAL_HAVE_HWLOC + /* retrieve the bind level for the other proc's job */ + lvptr = &pbind; + proc.vpid = ORTE_VPID_INVALID; + if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_LEVEL, (void**)&lvptr, OPAL_HWLOC_LEVEL_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + + /* retrieve the other's proc's bind idx */ + uiptr = 
&pbidx; + proc.vpid = i; + if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_INDEX, (void**)&uiptr, OPAL_UINT))) { + ORTE_ERROR_LOG(rc); + goto cleanup; + } + /* we share a node - see what else we share */ - pmap->locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, - orte_process_info.bind_level, - orte_process_info.bind_idx, - jmap->bind_level, - pmap->bind_idx); + locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, + orte_process_info.bind_level, + orte_process_info.bind_idx, + pbind, pbidx); #else - pmap->locality = OPAL_PROC_ON_NODE; + locality = OPAL_PROC_ON_NODE; #endif } else { - pmap->locality = OPAL_PROC_NON_LOCAL; + /* we don't share a node */ + locality = OPAL_PROC_NON_LOCAL; + } + /* store the locality */ + if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) { + ORTE_ERROR_LOG(rc); + goto cleanup; } } } cleanup: - if (NULL != nodes) { - free(nodes); + if (NULL != daemons) { + free(daemons); } if (NULL != local_rank) { free(local_rank); @@ -1120,7 +988,7 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) { orte_jobid_t jobid; orte_vpid_t i, num_procs; - int32_t *nodes=NULL; + orte_vpid_t *nodes=NULL; orte_local_rank_t *local_rank=NULL; orte_node_rank_t *node_rank=NULL; #if OPAL_HAVE_HWLOC @@ -1175,10 +1043,10 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) #endif /* allocate memory for the node info */ - nodes = (int32_t*)malloc(num_procs * 4); + nodes = (orte_vpid_t*)malloc(num_procs * sizeof(orte_vpid_t)); /* unpack it in one shot */ n=num_procs; - if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, nodes, &n, OPAL_INT32))) { + if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, nodes, &n, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } @@ -1370,152 +1238,3 @@ int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo) OBJ_DESTRUCT(&buf); return rc; } - -/*** NIDMAP UTILITIES ***/ -orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job) -{ - int i; - 
orte_jmap_t *jmap; - - /* unfortunately, job objects cannot be stored - * by index number as the jobid is a constructed - * value. So we have no choice but to cycle through - * the jobmap pointer array and look for the entry - * we want. We also cannot trust that the array is - * left-justified as cleanup is done - and array - * entries set to NULL - upon job completion. - */ - for (i=0; i < orte_jobmap.size; i++) { - if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) { - continue; - } - OPAL_OUTPUT_VERBOSE((10, orte_debug_output, - "%s lookup:pmap: checking job %s for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jmap->job), ORTE_JOBID_PRINT(job))); - if (job == jmap->job) { - return jmap; - } - } - - /* if we didn't find it, return NULL */ - return NULL; -} - -orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc) -{ - orte_jmap_t *jmap; - - if (NULL == (jmap = orte_util_lookup_jmap(proc->jobid))) { - return NULL; - } - - /* the get_item function will check the array index range, - * so we can just access it here - */ - return (orte_pmap_t *) opal_pointer_array_get_item(&jmap->pmap, proc->vpid); -} - -/* the daemon's vpid does not necessarily correlate - * to the node's index in the node array since - * some nodes may not have a daemon on them. Thus, - * we have to search for the daemon in the array. 
- * Fortunately, this is rarely done - */ -static orte_nid_t* find_daemon_node(orte_process_name_t *proc) -{ - int32_t i; - orte_nid_t *nid; - - for (i=0; i < orte_nidmap.size; i++) { - if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - continue; - } - OPAL_OUTPUT_VERBOSE((10, orte_debug_output, - "%s find:daemon:node: checking daemon %s for %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_VPID_PRINT(nid->daemon), ORTE_VPID_PRINT(proc->vpid))); - if (nid->daemon == proc->vpid) { - return nid; - } - } - - /* if we didn't find it, return NULL */ - return NULL; -} - -orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc) -{ - orte_pmap_t *pmap; - - OPAL_OUTPUT_VERBOSE((10, orte_debug_output, - "%s lookup:nid: looking for proc %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc))); - - if (ORTE_JOBID_IS_DAEMON(proc->jobid)) { - /* looking for a daemon */ - return find_daemon_node(proc); - } - - /* looking for an application proc */ - if (NULL == (pmap = orte_util_lookup_pmap(proc))) { - return NULL; - } - - /* the get_item function will check the array index range, - * so we can just access it here - */ - return (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, pmap->node); -} - -void orte_nidmap_dump(void) -{ - int i; - orte_nid_t *nid; - - opal_output(orte_clean_output, "*** DUMP OF NIDMAP ***"); - for (i=0; i < orte_nidmap.size; i++) { - if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) { - continue; - } - opal_output(orte_clean_output, "%s node[%d].name %s daemon %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i, - (NULL == nid->name) ? 
"NULL" : nid->name, - ORTE_VPID_PRINT(nid->daemon)); - } - opal_output(orte_clean_output, "\n\n"); -} - -void orte_jmap_dump(orte_jmap_t *jmap) -{ - int i; - orte_pmap_t *pmap; - - opal_output(orte_clean_output, "**** DUMP OF JOB %s (%s procs) ***", - ORTE_JOBID_PRINT(jmap->job), ORTE_VPID_PRINT(jmap->num_procs)); - - for (i=0; i < jmap->pmap.size; i++) { - if (NULL == (pmap = (orte_pmap_t*)opal_pointer_array_get_item(&jmap->pmap, i))) { - continue; - } - opal_output(orte_clean_output, "\tnode %d local_rank %d node_rank %d", - pmap->node, (int)pmap->local_rank, (int)pmap->node_rank); - } - opal_output(orte_clean_output, "\n"); -} - -void orte_jobmap_dump(void) -{ - int i; - orte_jmap_t *jmap; - - opal_output(orte_clean_output, "*** DUMP OF JOBMAP ***"); - for (i=0; i < orte_jobmap.size; i++) { - if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) { - continue; - } - orte_jmap_dump(jmap); - } - opal_output(orte_clean_output, "\n\n"); -} diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 0b612a92a9..8e10b44f2d 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -42,13 +42,6 @@ BEGIN_C_DECLS ORTE_DECLSPEC int orte_util_nidmap_init(opal_buffer_t *buffer); ORTE_DECLSPEC void orte_util_nidmap_finalize(void); -ORTE_DECLSPEC int orte_util_setup_local_nidmap_entries(void); - -ORTE_DECLSPEC orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job); -ORTE_DECLSPEC orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc); -ORTE_DECLSPEC orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc); - -ORTE_DECLSPEC int orte_util_set_proc_state(orte_process_name_t *proc, orte_proc_state_t state); ORTE_DECLSPEC int orte_util_encode_nodemap(opal_byte_object_t *boptr); ORTE_DECLSPEC int orte_util_decode_nodemap(opal_byte_object_t *boptr); @@ -62,10 +55,6 @@ ORTE_DECLSPEC int orte_util_decode_daemon_pidmap(opal_byte_object_t *bo); ORTE_DECLSPEC int orte_util_build_daemon_nidmap(char **nodes); #endif -ORTE_DECLSPEC void 
orte_nidmap_dump(void); -ORTE_DECLSPEC void orte_jmap_dump(orte_jmap_t *jmap); -ORTE_DECLSPEC void orte_jobmap_dump(void); - END_C_DECLS #endif diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index 45e0d2615e..ce024a950f 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -58,6 +58,7 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = { /* .my_port = */ 0, /* .num_restarts = */ 0, /* .my_node_rank = */ ORTE_NODE_RANK_INVALID, + /* .my_local_rank = */ ORTE_LOCAL_RANK_INVALID, /* .tmpdir_base = */ NULL, /* .top_session_dir = */ NULL, /* .job_session_dir = */ NULL, @@ -69,7 +70,6 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = { /* .bind_level = */ OPAL_HWLOC_NODE_LEVEL, /* .bind_idx = */ 0, #endif - /* .my_node = */ -1, /* .app_rank = */ -1, /* .peer_modex = */ -1, /* .peer_init_barrier = */ -1, diff --git a/orte/util/proc_info.h b/orte/util/proc_info.h index 57f8340023..00e2f10847 100644 --- a/orte/util/proc_info.h +++ b/orte/util/proc_info.h @@ -103,6 +103,7 @@ struct orte_proc_info_t { uint16_t my_port; /**< TCP port for out-of-band comm */ int32_t num_restarts; /**< number of times this proc has restarted */ orte_node_rank_t my_node_rank; /**< node rank */ + orte_local_rank_t my_local_rank; /**< local rank */ /* The session directory has the form * ///, where the prefix * can either be provided by the user via the @@ -121,8 +122,7 @@ struct orte_proc_info_t { opal_hwloc_level_t bind_level; unsigned int bind_idx; #endif - int32_t my_node; /**< index in the node array of the node I am on */ - int32_t app_rank; /**< rank within my app_context */ + int32_t app_rank; /**< rank within my app_context */ orte_grpcomm_coll_id_t peer_modex; /**< modex collective id */ orte_grpcomm_coll_id_t peer_init_barrier; /**< barrier id during init */ orte_grpcomm_coll_id_t peer_fini_barrier; /**< barrier id during finalize */