diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index e449eb323b..6378bf55dd 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -42,7 +42,7 @@ #include "ompi/runtime/mpiruntime.h" #include "ompi/runtime/params.h" -static opal_list_t ompi_proc_list; +opal_list_t ompi_proc_list; static opal_mutex_t ompi_proc_lock; static opal_hash_table_t ompi_proc_hash; diff --git a/ompi/proc/proc.h b/ompi/proc/proc.h index e165a38628..f35c02cf96 100644 --- a/ompi/proc/proc.h +++ b/ompi/proc/proc.h @@ -85,7 +85,7 @@ OBJ_CLASS_DECLARATION(ompi_proc_t); * Please use ompi_proc_local() instead. */ OMPI_DECLSPEC extern ompi_proc_t* ompi_proc_local_proc; - +OMPI_DECLSPEC extern opal_list_t ompi_proc_list; /* ******************************************************************** */ diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index e2fb025c55..274b09b0d1 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -660,7 +660,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, } SPML_VERBOSE(5, "rank %d ptl %d addr %p size %llu %s", - oshmem_proc_local_proc->super.proc_name.vpid, i, addr, (unsigned long long)size, + oshmem_proc_pe(oshmem_proc_local()), i, addr, (unsigned long long)size, mca_spml_base_mkey2str(&mkeys[i])); } diff --git a/oshmem/mca/spml/yoda/spml_yoda.c b/oshmem/mca/spml/yoda/spml_yoda.c index c069414d3e..e850fae7f4 100644 --- a/oshmem/mca/spml/yoda/spml_yoda.c +++ b/oshmem/mca/spml/yoda/spml_yoda.c @@ -452,7 +452,7 @@ sshmem_mkey_t *mca_spml_yoda_register(void* addr, SPML_VERBOSE(5, "rank %d btl %s va_base: 0x%p len: %d key %llx size %llu", - OSHMEM_PROC_VPID(oshmem_proc_local_proc), btl_type2str(ybtl->btl_type), + oshmem_proc_pe(oshmem_proc_local()), btl_type2str(ybtl->btl_type), mkeys[i].va_base, mkeys[i].len, (unsigned long long)mkeys[i].u.key, (unsigned long long)size); } *count = mca_spml_yoda.n_btls; diff --git a/oshmem/proc/proc.c b/oshmem/proc/proc.c index b971fdcf86..2446aa164d 100644 --- a/oshmem/proc/proc.c +++ b/oshmem/proc/proc.c @@ -33,518 +33,23 @@ #include "opal/util/arch.h" #include "opal/class/opal_list.h" -#include "ompi/proc/proc.h" -opal_convertor_t* oshmem_shmem_local_convertor = NULL; - -opal_list_t oshmem_proc_list = {{0}}; static opal_mutex_t oshmem_proc_lock; -oshmem_proc_t* oshmem_proc_local_proc = NULL; - -static void oshmem_proc_construct(oshmem_proc_t* proc); -static void oshmem_proc_destruct(oshmem_proc_t* proc); - -OBJ_CLASS_INSTANCE( oshmem_proc_t, - opal_list_item_t, - oshmem_proc_construct, - oshmem_proc_destruct); - -void oshmem_proc_construct(oshmem_proc_t* proc) -{ - memset(proc->proc_endpoints, 0, sizeof(proc->proc_endpoints)); - - /* By default all processors are supposedly having the same architecture as me. Thus, - * by default we run in a homogeneous environment. Later, when the RTE can tell us - * the arch of the remote nodes, we will have to set the convertors to the correct - * architecture. - */ - proc->super.proc_arch = opal_local_arch; - proc->super.proc_convertor = oshmem_shmem_local_convertor; - OBJ_RETAIN( oshmem_shmem_local_convertor); - - proc->super.proc_flags = 0; - proc->num_transports = 0; - - /* initialize this pointer to NULL */ - proc->super.proc_hostname = NULL; -} - -void oshmem_proc_destruct(oshmem_proc_t* proc) -{ - /* As all the convertors are created with OBJ_NEW we can just call OBJ_RELEASE. All, except - * the local convertor, will get destroyed at some point here. If the reference count is correct - * the local convertor (who has the reference count increased in the datatype) will not get - * destroyed here. It will be destroyed later when the ompi_datatype_finalize is called. - */ - OBJ_RELEASE(proc->super.proc_convertor); - - /* DO NOT FREE THE HOSTNAME FIELD AS THIS POINTS - * TO AN AREA ALLOCATED/FREE'D ELSEWHERE - */ - OPAL_THREAD_LOCK(&oshmem_proc_lock); - opal_list_remove_item(&oshmem_proc_list, (opal_list_item_t*) proc); - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); -} int oshmem_proc_init(void) { - orte_vpid_t i; - - OBJ_CONSTRUCT(&oshmem_proc_list, opal_list_t); OBJ_CONSTRUCT(&oshmem_proc_lock, opal_mutex_t); - oshmem_shmem_local_convertor = opal_convertor_create(opal_local_arch, 0); - - size_t ompi_num_procs; - ompi_proc_t **ompi_procs = ompi_proc_world(&ompi_num_procs); - /* create proc structures and find self */ - for (i = 0; i < orte_process_info.num_procs; i++) { - oshmem_proc_t *proc = OBJ_NEW(oshmem_proc_t); - opal_list_append(&oshmem_proc_list, (opal_list_item_t*)proc); - - proc->super.proc_name = ompi_procs[i]->super.proc_name; - proc->super.proc_arch = ompi_procs[i]->super.proc_arch; - proc->super.proc_flags = ompi_procs[i]->super.proc_flags; - proc->super.proc_hostname = ompi_procs[i]->super.proc_hostname; - - if (i == ORTE_PROC_MY_NAME->vpid) { - oshmem_proc_local_proc = proc; - } - } - - if (ompi_procs) - free(ompi_procs); return OSHMEM_SUCCESS; } -/* in some cases, all PE procs are required to do a modex so they - * can (at the least) exchange their architecture. Since we cannot - * know in advance if this was required, we provide a separate function - * to set the arch (instead of doing it inside of oshmem_proc_init) that - * can be called after the modex completes in oshmem_shmem_init. Thus, we - * know that - regardless of how the arch is known, whether via modex - * or dropped in from a local daemon - the arch can be set correctly - * at this time - */ -int oshmem_proc_set_arch(void) -{ - oshmem_proc_t *proc = NULL; - opal_list_item_t *item = NULL; - int ret = OSHMEM_SUCCESS; - - OPAL_THREAD_LOCK(&oshmem_proc_lock); - - for (item = opal_list_get_first(&oshmem_proc_list); - item != opal_list_get_end(&oshmem_proc_list); - item = opal_list_get_next(item)) { - proc = (oshmem_proc_t*) item; - - if (OSHMEM_PROC_VPID(proc) != ORTE_PROC_MY_NAME->vpid) { - /* if arch is different than mine, create a new convertor for this proc */ - if (proc->super.proc_arch != opal_local_arch) { -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - OBJ_RELEASE(proc->super.proc_convertor); - proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0); -#else - orte_show_help("help-shmem-runtime.txt", - "heterogeneous-support-unavailable", - true, - orte_process_info.nodename, - proc->super.proc_hostname == NULL ? - "" : - proc->super.proc_hostname); - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - return OSHMEM_ERR_NOT_SUPPORTED; -#endif - } - } - } - - /* Set predefined groups */ - ret = oshmem_proc_group_init(); - - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - - return ret; -} - int oshmem_proc_finalize(void) { - opal_list_item_t *item; - - /* Destroy all groups */ - oshmem_proc_group_finalize(); - - /* remove all items from list and destroy them. Since we cannot know - * the reference count of the procs for certain, it is possible that - * a single OBJ_RELEASE won't drive the count to zero, and hence will - * not release the memory. Accordingly, we cycle through the list here, - * calling release on each item. - * - * This will cycle until it forces the reference count of each item - * to zero, thus causing the destructor to run - which will remove - * the item from the list! - * - * We cannot do this under the thread lock as the destructor will - * call it when removing the item from the list. However, this function - * is ONLY called from MPI_Finalize, and all threads are prohibited from - * calling an MPI function once ANY thread has called MPI_Finalize. Of - * course, multiple threads are allowed to call MPI_Finalize, so this - * function may get called multiple times by various threads. We believe - * it is thread safe to do so...though it may not -appear- to be so - * without walking through the entire list/destructor sequence. - */ - while (opal_list_get_end(&oshmem_proc_list) - != (item = opal_list_get_first(&oshmem_proc_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE( oshmem_shmem_local_convertor); - /* now destruct the list and thread lock */ - OBJ_DESTRUCT(&oshmem_proc_list); OBJ_DESTRUCT(&oshmem_proc_lock); return OSHMEM_SUCCESS; } -oshmem_proc_t** oshmem_proc_world(size_t *size) -{ - oshmem_proc_t **procs; - oshmem_proc_t *proc; - size_t count = 0; - orte_ns_cmp_bitmask_t mask; - orte_process_name_t my_name; - - /* check bozo case */ - if (NULL == oshmem_proc_local_proc) { - return NULL ; - } - mask = ORTE_NS_CMP_JOBID; - my_name = *(orte_process_name_t*)&oshmem_proc_local_proc->super.proc_name; - - /* First count how many match this jobid */ - OPAL_THREAD_LOCK(&oshmem_proc_lock); - for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); - proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); - proc = (oshmem_proc_t*) opal_list_get_next(proc)) { - if (OPAL_EQUAL - == orte_util_compare_name_fields(mask, - (orte_process_name_t*)&proc->super.proc_name, - &my_name)) { - ++count; - } - } - - /* allocate an array */ - procs = (oshmem_proc_t**) malloc(count * sizeof(oshmem_proc_t*)); - if (NULL == procs) { - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - return NULL ; - } - - /* now save only the procs that match this jobid */ - count = 0; - for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); - proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); - proc = (oshmem_proc_t*) opal_list_get_next(proc)) { - if (OPAL_EQUAL - == orte_util_compare_name_fields(mask, - (orte_process_name_t*)&proc->super.proc_name, - &my_name)) { - /* DO NOT RETAIN THIS OBJECT - the reference count on this - * object will be adjusted by external callers. The intent - * here is to allow the reference count to drop to zero if - * the app no longer desires to communicate with this proc. - * For example, the proc may call comm_disconnect on all - * communicators involving this proc. In such cases, we want - * the proc object to be removed from the list. By not incrementing - * the reference count here, we allow this to occur. - * - * We don't implement that yet, but we are still safe for now as - * the OBJ_NEW in oshmem_proc_init owns the initial reference - * count which cannot be released until oshmem_proc_finalize is - * called. - */ - procs[count++] = proc; - } - } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - - *size = count; - return procs; -} - -oshmem_proc_t** oshmem_proc_all(size_t* size) -{ - oshmem_proc_t **procs = - (oshmem_proc_t**) malloc(opal_list_get_size(&oshmem_proc_list) - * sizeof(oshmem_proc_t*)); - oshmem_proc_t *proc; - size_t count = 0; - - if (NULL == procs) { - return NULL ; - } - - OPAL_THREAD_LOCK(&oshmem_proc_lock); - for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); - proc && (proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list)); - proc = (oshmem_proc_t*)opal_list_get_next(proc)) { - /* We know this isn't consistent with the behavior in oshmem_proc_world, - * but we are leaving the RETAIN for now because the code using this function - * assumes that the results need to be released when done. It will - * be cleaned up later as the "fix" will impact other places in - * the code - */ - OBJ_RETAIN(proc); - procs[count++] = proc; - } - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - - *size = count; - - return procs; -} - -oshmem_proc_t** oshmem_proc_self(size_t* size) -{ - oshmem_proc_t **procs = (oshmem_proc_t**) malloc(sizeof(oshmem_proc_t*)); - if (NULL == procs) { - return NULL ; - } - /* We know this isn't consistent with the behavior in oshmem_proc_world, - * but we are leaving the RETAIN for now because the code using this function - * assumes that the results need to be released when done. It will - * be cleaned up later as the "fix" will impact other places in - * the code - */ - OBJ_RETAIN(oshmem_proc_local_proc); - - *procs = oshmem_proc_local_proc; - *size = 1; - return procs; -} - -oshmem_proc_t * oshmem_proc_find(const orte_process_name_t * name) -{ - oshmem_proc_t *proc, *rproc = NULL; - orte_ns_cmp_bitmask_t mask; - - /* return the proc-struct which matches this jobid+process id */ - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - OPAL_THREAD_LOCK(&oshmem_proc_lock); - for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); - proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); - proc = (oshmem_proc_t*) opal_list_get_next(proc)) { - if (OPAL_EQUAL - == orte_util_compare_name_fields(mask, - (orte_process_name_t*)&proc->super.proc_name, - name)) { - rproc = proc; - break; - } - } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - - return rproc; -} - - -int oshmem_proc_pack(oshmem_proc_t **proclist, - int proclistsize, - opal_buffer_t* buf) -{ - int i, rc; - - OPAL_THREAD_LOCK(&oshmem_proc_lock); - - /* cycle through the provided array, packing the OSHMEM level - * data for each proc. This data may or may not be included - * in any subsequent modex operation, so we include it here - * to ensure completion of a connect/accept handshake. See - * the ompi/mca/dpm framework for an example of where and how - * this info is used. - * - * Eventually, we will review the procedures that call this - * function to see if duplication of communication can be - * reduced. For now, just go ahead and pack the info so it - * can be sent. - */ - for (i = 0; i < proclistsize; i++) { - rc = opal_dss.pack(buf, &(proclist[i]->super.proc_name), 1, ORTE_NAME); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - return rc; - } - rc = opal_dss.pack(buf, &(proclist[i]->super.proc_arch), 1, OPAL_UINT32); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - return rc; - } - rc = opal_dss.pack(buf, &(proclist[i]->super.proc_hostname), 1, OPAL_STRING); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - return rc; - } - } OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - return OSHMEM_SUCCESS; -} - -static oshmem_proc_t * -oshmem_proc_find_and_add(const orte_process_name_t * name, bool* isnew) -{ - oshmem_proc_t *proc, *rproc = NULL; - orte_ns_cmp_bitmask_t mask; - - /* return the proc-struct which matches this jobid+process id */ - mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID; - OPAL_THREAD_LOCK(&oshmem_proc_lock); - for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); - proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); - proc = (oshmem_proc_t*) opal_list_get_next(proc)) { - if (OPAL_EQUAL - == orte_util_compare_name_fields(mask, - (orte_process_name_t*)&proc->super.proc_name, - name)) { - rproc = proc; - *isnew = false; - break; - } - } - - /* if we didn't find this proc in the list, create a new - * proc_t and append it to the list - */ - if (NULL == rproc) { - *isnew = true; - rproc = OBJ_NEW(oshmem_proc_t); - if (NULL != rproc) { - opal_list_append(&oshmem_proc_list, (opal_list_item_t*)rproc); - rproc->super.proc_name = *(opal_process_name_t*)name; - } - /* caller had better fill in the rest of the proc, or there's - going to be pain later... */ - } - - OPAL_THREAD_UNLOCK(&oshmem_proc_lock); - - return rproc; -} - -int oshmem_proc_unpack(opal_buffer_t* buf, - int proclistsize, - oshmem_proc_t ***proclist, - int *newproclistsize, - oshmem_proc_t ***newproclist) -{ - int i; - size_t newprocs_len = 0; - oshmem_proc_t **plist = NULL, **newprocs = NULL; - - /* do not free plist *ever*, since it is used in the remote group - structure of a communicator */ - plist = (oshmem_proc_t **) calloc(proclistsize, sizeof(oshmem_proc_t *)); - if (NULL == plist) { - return OSHMEM_ERR_OUT_OF_RESOURCE; - } - /* free this on the way out */ - newprocs = (oshmem_proc_t **) calloc(proclistsize, sizeof(oshmem_proc_t *)); - if (NULL == newprocs) { - free(plist); - return OSHMEM_ERR_OUT_OF_RESOURCE; - } - - /* cycle through the array of provided procs and unpack - * their info - as packed by oshmem_proc_pack - */ - for (i = 0; i < proclistsize; i++) { - orte_std_cntr_t count = 1; - orte_process_name_t new_name; - uint32_t new_arch; - char *new_hostname; - bool isnew = false; - int rc; - - rc = opal_dss.unpack(buf, &new_name, &count, ORTE_NAME); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - free(plist); - free(newprocs); - return rc; - } - rc = opal_dss.unpack(buf, &new_arch, &count, OPAL_UINT32); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - free(plist); - free(newprocs); - return rc; - } - rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING); - if (rc != ORTE_SUCCESS) { - ORTE_ERROR_LOG(rc); - free(plist); - free(newprocs); - return rc; - } - - /* see if this proc is already on our oshmem_proc_list */ - plist[i] = oshmem_proc_find_and_add(&new_name, &isnew); - if (isnew) { - /* if not, then it was added, so update the values - * in the proc_t struct with the info that was passed - * to us - */ - newprocs[newprocs_len++] = plist[i]; - - /* update all the values */ - plist[i]->super.proc_arch = new_arch; - /* if arch is different than mine, create a new convertor for this proc */ - if (plist[i]->super.proc_arch != opal_local_arch) { -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - OBJ_RELEASE(plist[i]->super.proc_convertor); - plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0); -#else - orte_show_help("help-shmem-runtime.txt", - "heterogeneous-support-unavailable", - true, - orte_process_info.nodename, - new_hostname == NULL ? "" : - new_hostname); - free(plist); - free(newprocs); - return OSHMEM_ERR_NOT_SUPPORTED; -#endif - } - if (0 - == strcmp(oshmem_proc_local_proc->super.proc_hostname, - new_hostname)) { - plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU - | OPAL_PROC_ON_CLUSTER); - } - - /* Save the hostname */ - plist[i]->super.proc_hostname = new_hostname; - - /* eventually, we will update the orte/mca/ess framework's data - * to contain the info for the new proc. For now, we ignore - * this step since the MPI layer already has all the info - * it requires - */ - } - } - - if (NULL != newproclistsize) - *newproclistsize = newprocs_len; - if (NULL != newproclist) { - *newproclist = newprocs; - } else if (newprocs != NULL ) { - free(newprocs); - } - - *proclist = plist; - return OSHMEM_SUCCESS; -} - opal_pointer_array_t oshmem_group_array = {{0}}; oshmem_group_t* oshmem_group_all = NULL; @@ -553,7 +58,7 @@ oshmem_group_t* oshmem_group_null = NULL; OBJ_CLASS_INSTANCE(oshmem_group_t, opal_object_t, NULL, NULL); -OSHMEM_DECLSPEC int oshmem_proc_group_init(void) +int oshmem_proc_group_init(void) { /* Setup communicator array */ @@ -571,14 +76,14 @@ OSHMEM_DECLSPEC int oshmem_proc_group_init(void) == (oshmem_group_all = oshmem_proc_group_create(0, 1, - opal_list_get_size(&oshmem_proc_list)))) { + opal_list_get_size(&ompi_proc_list)))) { oshmem_proc_group_destroy(oshmem_group_all); return OSHMEM_ERROR; } /* Setup SHMEM_GROUP_SELF */ if (NULL - == (oshmem_group_self = oshmem_proc_group_create(OSHMEM_PROC_VPID(oshmem_proc_local()), + == (oshmem_group_self = oshmem_proc_group_create(oshmem_proc_pe(oshmem_proc_local()), 0, 1))) { oshmem_proc_group_destroy(oshmem_group_self); @@ -591,7 +96,7 @@ OSHMEM_DECLSPEC int oshmem_proc_group_init(void) return OSHMEM_SUCCESS; } -OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void) +int oshmem_proc_group_finalize(void) { int max, i; oshmem_group_t *group; @@ -613,9 +118,9 @@ OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void) return OSHMEM_SUCCESS; } -OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, - int pe_stride, - size_t pe_size) +oshmem_group_t* oshmem_proc_group_create(int pe_start, + int pe_stride, + size_t pe_size) { int cur_pe, count_pe; int i; @@ -638,11 +143,11 @@ OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, return NULL ; } - group->my_pe = OSHMEM_PROC_VPID(oshmem_proc_local()); + group->my_pe = oshmem_proc_pe(oshmem_proc_local()); group->is_member = 0; /* now save only the procs that match this jobid */ - for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list); - proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list); + for (proc = (oshmem_proc_t*) opal_list_get_first(&ompi_proc_list); + proc != (oshmem_proc_t*) opal_list_get_end(&ompi_proc_list); proc = (oshmem_proc_t*) opal_list_get_next(proc)) { if (count_pe >= (int) pe_size) { break; @@ -687,7 +192,7 @@ OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start, return group; } -OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group) +void oshmem_proc_group_destroy(oshmem_group_t* group) { if (group) { mca_scoll_base_group_unselect(group); diff --git a/oshmem/proc/proc.h b/oshmem/proc/proc.h index 688e3dcc1a..e9c6ee695e 100644 --- a/oshmem/proc/proc.h +++ b/oshmem/proc/proc.h @@ -24,6 +24,7 @@ #include "orte/types.h" #include "orte/runtime/orte_globals.h" +#include "ompi/proc/proc.h" #include "ompi/communicator/communicator.h" BEGIN_C_DECLS @@ -85,18 +86,6 @@ OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_all; OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_self; OSHMEM_DECLSPEC extern oshmem_group_t* oshmem_group_null; -/** - * @private - * - * Pointer to the oshmem_proc_t structure for the local process - * - * Pointer to the oshmem_proc_t structure for the local process. - * - * @note This pointer is declared here to allow inline functions - * within this header file to access the local process quickly. - * Please use oshmem_proc_local() instead. - */ -OSHMEM_DECLSPEC extern oshmem_proc_t* oshmem_proc_local_proc; /* ******************************************************************** */ @@ -120,21 +109,6 @@ OSHMEM_DECLSPEC extern oshmem_proc_t* oshmem_proc_local_proc; */ OSHMEM_DECLSPEC int oshmem_proc_init(void); -/** - * Set the arch of each proc in the oshmem_proc_list - * - * In some environments, SHMEM procs are required to exchange their - * arch via a modex operation during mpi_init. In other environments, - * the arch is determined by other mechanisms and provided to the - * proc directly. To support both mechanisms, we provide a separate - * function to set the arch of the procs -after- the modex operation - * has completed in mpi_init. - * - * @retval OSHMEM_SUCCESS Archs successfully set - * @retval OSHMEM_ERROR Archs could not be initialized - */ -OSHMEM_DECLSPEC int oshmem_proc_set_arch(void); - /** * Finalize the OSHMEM Process subsystem * @@ -146,64 +120,6 @@ OSHMEM_DECLSPEC int oshmem_proc_set_arch(void); */ OSHMEM_DECLSPEC int oshmem_proc_finalize(void); -/** - * Returns the list of proc instances associated with this job. - * - * Returns the list of proc instances associated with this job. Given - * the current association between a job and an pe set, this - * function provides the process instances for the current - * pe set. - * - * @note The reference count of each process in the array is - * NOT incremented - the caller is responsible for ensuring the - * correctness of the reference count once they are done with - * the array. - * - * @param[in] size Number of processes in the oshmem_proc_t array - * - * @return Array of pointers to proc instances in the current - * pe set, or NULL if there is an internal failure. - */ -OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_world(size_t* size); - -/** - * Returns the list of all known proc instances. - * - * Returns the list of all known proc instances, including those in - * other pe sets. It is possible that we may no longer be - * connected to some of the procs returned (in the SHMEM sense of the - * word connected). In a strictly SHMEM-1 application, this function - * will return the same information as oshmem_proc_world(). - * - * @note The reference count of each process in the array is - * incremented and the caller is responsible for releasing each - * process in the array, as well as freeing the array. - * - * @param[in] size Number of processes in the oshmem_proc_t array - * - * @return Array of pointers to proc instances in the current - * known universe, or NULL if there is an internal failure. - */ -OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_all(size_t* size); - -/** - * Returns a list of the local process - * - * Returns a list containing the local process (and only the local - * process). Has calling semantics similar to oshmem_proc_world() and - * oshmem_proc_all(). - * - * @note The reference count of each process in the array is - * incremented and the caller is responsible for releasing each - * process in the array, as well as freeing the array. - * - * @param[in] size Number of processes in the oshmem_proc_t array - * - * @return Array of pointers to proc instances in the current - * known universe, or NULL if there is an internal failure. - */ -OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_self(size_t* size); - /** * Returns a pointer to the local process * @@ -215,7 +131,7 @@ OSHMEM_DECLSPEC oshmem_proc_t** oshmem_proc_self(size_t* size); */ static inline oshmem_proc_t* oshmem_proc_local(void) { - return oshmem_proc_local_proc; + return (oshmem_proc_t *)ompi_proc_local_proc; } /** @@ -229,75 +145,10 @@ static inline oshmem_proc_t* oshmem_proc_local(void) * * @return Pointer to the process instance for \c name */ -OSHMEM_DECLSPEC oshmem_proc_t * oshmem_proc_find(const orte_process_name_t* name); - -/** - * Pack proc list into portable buffer - * - * This function takes a list of oshmem_proc_t pointers (e.g. as given - * in groups) and returns a orte buffer containing all information - * needed to add the proc to a remote list. This includes the ORTE - * process name, the architecture, and the hostname. Ordering is - * maintained. The buffer is packed to be sent to a remote node with - * different architecture (endian or word size). The buffer can be - * dss unloaded to be sent using SHMEM or send using rml_send_packed(). - * - * @param[in] proclist List of process pointers - * @param[in] proclistsize Length of the proclist array - * @param[in,out] buf An orte_buffer containing the packed names. - * The buffer must be constructed but empty when - * passed to this function - * @retval OSHMEM_SUCCESS Success - * @retval OSHMEM_ERROR Unspecified error - */ -OSHMEM_DECLSPEC int oshmem_proc_pack(oshmem_proc_t **proclist, - int proclistsize, - opal_buffer_t *buf); - -/** - * Unpack a portable buffer of procs - * - * This function unpacks a packed list of oshmem_proc_t structures and - * returns the ordered list of proc structures. If the given proc is - * already "known", the architecture and hostname information in the - * buffer is ignored. If the proc is "new" to this process, it will - * be added to the global list of known procs, with information - * provided in the buffer. The lookup actions are always entirely - * local. The proclist returned is a list of pointers to all procs in - * the buffer, whether they were previously known or are new to this - * process. - * - * @note In previous versions of this function, The PML's add_procs() - * function was called for any new processes discovered as a result of - * this operation. That is no longer the case -- the caller must use - * the newproclist information to call add_procs() if necessary. - * - * @note The reference count for procs created as a result of this - * operation will be set to 1. Existing procs will not have their - * reference count changed. The reference count of a proc at the - * return of this function is the same regardless of whether NULL is - * provided for newproclist. The user is responsible for freeing the - * newproclist array. - * - * @param[in] buf orte_buffer containing the packed names - * @param[in] proclistsize number of expected proc-pointres - * @param[out] proclist list of process pointers - * @param[out] newproclistsize Number of new procs added as a result - * of the unpack operation. NULL may be - * provided if information is not needed. - * @param[out] newproclist List of new procs added as a result of - * the unpack operation. NULL may be - * provided if informationis not needed. - * - * Return value: - * OSHMEM_SUCCESS on success - * OSHMEM_ERROR else - */ -OSHMEM_DECLSPEC int oshmem_proc_unpack(opal_buffer_t *buf, - int proclistsize, - oshmem_proc_t ***proclist, - int *newproclistsize, - oshmem_proc_t ***newproclist); +static inline oshmem_proc_t * oshmem_proc_find(const orte_process_name_t* name) +{ + return (oshmem_proc_t *)ompi_proc_find(name); +} static inline int oshmem_proc_pe(oshmem_proc_t *proc) { @@ -420,10 +271,8 @@ static inline int oshmem_proc_group_is_member(oshmem_group_t *group) static inline int oshmem_num_procs(void) { - extern opal_list_t oshmem_proc_list; - if (!oshmem_group_all) - return opal_list_get_size(&oshmem_proc_list); + return opal_list_get_size(&ompi_proc_list); return oshmem_group_all->proc_count; } diff --git a/oshmem/runtime/oshmem_shmem_finalize.c b/oshmem/runtime/oshmem_shmem_finalize.c index 01292698f6..761bde32a0 100644 --- a/oshmem/runtime/oshmem_shmem_finalize.c +++ b/oshmem/runtime/oshmem_shmem_finalize.c @@ -148,6 +148,11 @@ static int _shmem_finalize(void) return ret; } + /* free proc_group resources */ + if (OSHMEM_SUCCESS != (ret = oshmem_proc_group_finalize())) { + return ret; + } + /* free proc resources */ if (OSHMEM_SUCCESS != (ret = oshmem_proc_finalize())) { return ret; diff --git a/oshmem/runtime/oshmem_shmem_init.c b/oshmem/runtime/oshmem_shmem_init.c index b7937dd649..895911b18e 100644 --- a/oshmem/runtime/oshmem_shmem_init.c +++ b/oshmem/runtime/oshmem_shmem_init.c @@ -238,11 +238,6 @@ static int _shmem_init(int argc, char **argv, int requested, int *provided) goto error; } - /* We need to do this anyway. - * This place requires to be reviewed and more elegant way is expected - */ - ompi_proc_local_proc = (ompi_proc_t*) oshmem_proc_local_proc; - /* Register the OSHMEM layer's MCA parameters */ if (OSHMEM_SUCCESS != (ret = oshmem_shmem_register_params())) { error = "oshmem_info_register: oshmem_register_params failed"; @@ -297,11 +292,8 @@ static int _shmem_init(int argc, char **argv, int requested, int *provided) goto error; } - /* identify the architectures of remote procs and setup - * their datatype convertors, if required - */ - if (OSHMEM_SUCCESS != (ret = oshmem_proc_set_arch())) { - error = "oshmem_proc_set_arch failed"; + if (OSHMEM_SUCCESS != (ret = oshmem_proc_group_init())) { + error = "oshmem_proc_group_init() failed"; goto error; } @@ -312,7 +304,7 @@ static int _shmem_init(int argc, char **argv, int requested, int *provided) goto error; } - /* There is issue with call add_proc twice so + /* TODO: DO WE NEED IT? There is issue with call add_proc twice so * we need to use btl info got from PML add_procs() before call of SPML add_procs() */ {