/*
 * Copyright (c) 2013      Mellanox Technologies, Inc.
 *                         All rights reserved.
 * Copyright (c) 2014-2015 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "oshmem_config.h"
#include "oshmem/proc/proc.h"
#include "oshmem/constants.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/mca/scoll/base/base.h"

#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#include <stdlib.h>
#include <string.h>

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/proc_info.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

#include "opal/datatype/opal_convertor.h"
#include "opal/threads/mutex.h"
#include "opal/dss/dss.h"
#include "opal/util/arch.h"
#include "opal/class/opal_list.h"

#include "ompi/proc/proc.h"

/* Convertor shared by every proc that has the local architecture. */
opal_convertor_t* oshmem_shmem_local_convertor = NULL;

/* Global list of all known procs; guarded by oshmem_proc_lock. */
opal_list_t oshmem_proc_list = {{0}};
static opal_mutex_t oshmem_proc_lock;

/* The proc object describing this process (set in oshmem_proc_init). */
oshmem_proc_t* oshmem_proc_local_proc = NULL;

static void oshmem_proc_construct(oshmem_proc_t* proc);
static void oshmem_proc_destruct(oshmem_proc_t* proc);

OBJ_CLASS_INSTANCE( oshmem_proc_t,
                    opal_list_item_t,
                    oshmem_proc_construct,
                    oshmem_proc_destruct);

/*
 * Class constructor: zero the endpoint table and set homogeneous defaults.
 * Takes a reference on the shared local convertor.
 */
static void oshmem_proc_construct(oshmem_proc_t* proc)
{
    memset(proc->proc_endpoints, 0, sizeof(proc->proc_endpoints));

    /* By default all processors are supposedly having the same architecture as me. Thus,
     * by default we run in a homogeneous environment. Later, when the RTE can tell us
     * the arch of the remote nodes, we will have to set the convertors to the correct
     * architecture.
     */
    proc->super.proc_arch = opal_local_arch;
    proc->super.proc_convertor = oshmem_shmem_local_convertor;
    OBJ_RETAIN( oshmem_shmem_local_convertor);
    proc->super.proc_flags = 0;
    proc->num_transports = 0;

    /* initialize this pointer to NULL */
    proc->super.proc_hostname = NULL;
}

/*
 * Class destructor: drop the convertor reference and unlink the proc from
 * oshmem_proc_list (taking oshmem_proc_lock while doing so).
 */
static void oshmem_proc_destruct(oshmem_proc_t* proc)
{
    /* As all the convertors are created with OBJ_NEW we can just call OBJ_RELEASE. All, except
     * the local convertor, will get destroyed at some point here. If the reference count is correct
     * the local convertor (who has the reference count increased in the datatype) will not get
     * destroyed here. It will be destroyed later when the ompi_datatype_finalize is called.
     */
    OBJ_RELEASE(proc->super.proc_convertor);

    /* DO NOT FREE THE HOSTNAME FIELD AS THIS POINTS
     * TO AN AREA ALLOCATED/FREE'D ELSEWHERE
     */
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    opal_list_remove_item(&oshmem_proc_list, (opal_list_item_t*) proc);
    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
}

/*
 * Build oshmem_proc_list from the array returned by ompi_proc_world() and
 * record the local proc.
 *
 * Returns OSHMEM_SUCCESS, or OSHMEM_ERR_OUT_OF_RESOURCE when the OMPI proc
 * array or a new proc object could not be obtained.
 */
int oshmem_proc_init(void)
{
    orte_vpid_t i;
    size_t ompi_num_procs = 0;
    ompi_proc_t **ompi_procs;

    OBJ_CONSTRUCT(&oshmem_proc_list, opal_list_t);
    OBJ_CONSTRUCT(&oshmem_proc_lock, opal_mutex_t);

    oshmem_shmem_local_convertor = opal_convertor_create(opal_local_arch, 0);

    ompi_procs = ompi_proc_world(&ompi_num_procs);
    if (NULL == ompi_procs) {
        /* nothing to copy from; indexing the array below would crash */
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    /* create proc structures and find self; never index past the end of
     * the array that ompi_proc_world actually returned */
    for (i = 0;
         i < orte_process_info.num_procs && (size_t) i < ompi_num_procs;
         i++) {
        oshmem_proc_t *proc = OBJ_NEW(oshmem_proc_t);
        if (NULL == proc) {
            free(ompi_procs);
            return OSHMEM_ERR_OUT_OF_RESOURCE;
        }
        opal_list_append(&oshmem_proc_list, (opal_list_item_t*)proc);

        proc->super.proc_name = ompi_procs[i]->super.proc_name;
        proc->super.proc_arch = ompi_procs[i]->super.proc_arch;
        proc->super.proc_flags = ompi_procs[i]->super.proc_flags;
        proc->super.proc_hostname = ompi_procs[i]->super.proc_hostname;

        if (i == ORTE_PROC_MY_NAME->vpid) {
            oshmem_proc_local_proc = proc;
        }
    }

    free(ompi_procs);

    return OSHMEM_SUCCESS;
}

/* in some cases, all PE procs are required to do a modex so they
 * can (at the least) exchange their architecture. Since we cannot
 * know in advance if this was required, we provide a separate function
 * to set the arch (instead of doing it inside of oshmem_proc_init) that
 * can be called after the modex completes in oshmem_shmem_init. Thus, we
 * know that - regardless of how the arch is known, whether via modex
 * or dropped in from a local daemon - the arch can be set correctly
 * at this time
 *
 * Returns the result of oshmem_proc_group_init(), or
 * OSHMEM_ERR_NOT_SUPPORTED when a remote proc has a different arch and
 * heterogeneous support is compiled out.
 */
int oshmem_proc_set_arch(void)
{
    oshmem_proc_t *proc = NULL;
    opal_list_item_t *item = NULL;
    int ret = OSHMEM_SUCCESS;

    OPAL_THREAD_LOCK(&oshmem_proc_lock);

    for (item = opal_list_get_first(&oshmem_proc_list);
         item != opal_list_get_end(&oshmem_proc_list);
         item = opal_list_get_next(item)) {
        proc = (oshmem_proc_t*) item;

        if (OSHMEM_PROC_VPID(proc) != ORTE_PROC_MY_NAME->vpid) {
            /* if arch is different than mine, create a new convertor for this proc */
            if (proc->super.proc_arch != opal_local_arch) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
                OBJ_RELEASE(proc->super.proc_convertor);
                proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0);
#else
                orte_show_help("help-shmem-runtime.txt",
                               "heterogeneous-support-unavailable",
                               true,
                               orte_process_info.nodename,
                               proc->super.proc_hostname == NULL ? "" : proc->super.proc_hostname);
                OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
                return OSHMEM_ERR_NOT_SUPPORTED;
#endif
            }
        }
    }

    /* Set predefined groups */
    ret = oshmem_proc_group_init();

    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    return ret;
}

/*
 * Tear down all groups, release every proc on oshmem_proc_list, release the
 * local convertor and destruct the list and its lock.
 * Always returns OSHMEM_SUCCESS.
 */
int oshmem_proc_finalize(void)
{
    opal_list_item_t *item;

    /* Destroy all groups */
    oshmem_proc_group_finalize();

    /* remove all items from list and destroy them. Since we cannot know
     * the reference count of the procs for certain, it is possible that
     * a single OBJ_RELEASE won't drive the count to zero, and hence will
     * not release the memory. Accordingly, we cycle through the list here,
     * calling release on each item.
     *
     * This will cycle until it forces the reference count of each item
     * to zero, thus causing the destructor to run - which will remove
     * the item from the list!
     *
     * We cannot do this under the thread lock as the destructor will
     * call it when removing the item from the list. However, this function
     * is ONLY called from MPI_Finalize, and all threads are prohibited from
     * calling an MPI function once ANY thread has called MPI_Finalize. Of
     * course, multiple threads are allowed to call MPI_Finalize, so this
     * function may get called multiple times by various threads. We believe
     * it is thread safe to do so...though it may not -appear- to be so
     * without walking through the entire list/destructor sequence.
     */
    while (opal_list_get_end(&oshmem_proc_list) !=
           (item = opal_list_get_first(&oshmem_proc_list))) {
        OBJ_RELEASE(item);
    }

    OBJ_RELEASE( oshmem_shmem_local_convertor);

    /* now destruct the list and thread lock */
    OBJ_DESTRUCT(&oshmem_proc_list);
    OBJ_DESTRUCT(&oshmem_proc_lock);

    return OSHMEM_SUCCESS;
}

/*
 * Return a malloc'ed array of the procs whose jobid matches the local proc,
 * storing the element count in *size. The entries are NOT retained.
 * Returns NULL (leaving *size untouched) when there is no local proc or the
 * allocation fails.
 */
oshmem_proc_t** oshmem_proc_world(size_t *size)
{
    oshmem_proc_t **procs;
    oshmem_proc_t *proc;
    size_t count = 0;
    orte_ns_cmp_bitmask_t mask;
    orte_process_name_t my_name;

    /* check bozo case */
    if (NULL == oshmem_proc_local_proc) {
        return NULL ;
    }
    mask = ORTE_NS_CMP_JOBID;
    my_name = *(orte_process_name_t*)&oshmem_proc_local_proc->super.proc_name;

    /* First count how many match this jobid */
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
         proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
         proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask,
                                                        (orte_process_name_t*)&proc->super.proc_name,
                                                        &my_name)) {
            ++count;
        }
    }

    /* allocate an array */
    procs = (oshmem_proc_t**) malloc(count * sizeof(oshmem_proc_t*));
    if (NULL == procs) {
        OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
        return NULL ;
    }

    /* now save only the procs that match this jobid */
    count = 0;
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
         proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
         proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask,
                                                        (orte_process_name_t*)&proc->super.proc_name,
                                                        &my_name)) {
            /* DO NOT RETAIN THIS OBJECT - the reference count on this
             * object will be adjusted by external callers. The intent
             * here is to allow the reference count to drop to zero if
             * the app no longer desires to communicate with this proc.
             * For example, the proc may call comm_disconnect on all
             * communicators involving this proc. In such cases, we want
             * the proc object to be removed from the list. By not incrementing
             * the reference count here, we allow this to occur.
             *
             * We don't implement that yet, but we are still safe for now as
             * the OBJ_NEW in oshmem_proc_init owns the initial reference
             * count which cannot be released until oshmem_proc_finalize is
             * called.
             */
            procs[count++] = proc;
        }
    }
    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    *size = count;
    return procs;
}

/*
 * Return a malloc'ed array of every proc on oshmem_proc_list, storing the
 * element count in *size. Each entry is retained (OBJ_RETAIN).
 * Returns NULL (leaving *size untouched) when the allocation fails.
 */
oshmem_proc_t** oshmem_proc_all(size_t* size)
{
    oshmem_proc_t **procs;
    oshmem_proc_t *proc;
    size_t count = 0;

    /* Size the array while holding the lock so the list cannot grow
     * between the allocation and the fill loop below. */
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    procs = (oshmem_proc_t**) malloc(opal_list_get_size(&oshmem_proc_list) * sizeof(oshmem_proc_t*));
    if (NULL == procs) {
        OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
        return NULL ;
    }

    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
         proc && (proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list));
         proc = (oshmem_proc_t*)opal_list_get_next(proc)) {
        /* We know this isn't consistent with the behavior in oshmem_proc_world,
         * but we are leaving the RETAIN for now because the code using this function
         * assumes that the results need to be released when done. It will
         * be cleaned up later as the "fix" will impact other places in
         * the code
         */
        OBJ_RETAIN(proc);
        procs[count++] = proc;
    }
    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    *size = count;
    return procs;
}

/*
 * Return a malloc'ed one-element array holding the (retained) local proc and
 * set *size to 1. Returns NULL when there is no local proc or the allocation
 * fails.
 */
oshmem_proc_t** oshmem_proc_self(size_t* size)
{
    oshmem_proc_t **procs;

    /* check bozo case, as oshmem_proc_world does: retaining a NULL object
     * below would dereference a NULL pointer */
    if (NULL == oshmem_proc_local_proc) {
        return NULL ;
    }

    procs = (oshmem_proc_t**) malloc(sizeof(oshmem_proc_t*));
    if (NULL == procs) {
        return NULL ;
    }

    /* We know this isn't consistent with the behavior in oshmem_proc_world,
     * but we are leaving the RETAIN for now because the code using this function
     * assumes that the results need to be released when done. It will
     * be cleaned up later as the "fix" will impact other places in
     * the code
     */
    OBJ_RETAIN(oshmem_proc_local_proc);
    *procs = oshmem_proc_local_proc;
    *size = 1;
    return procs;
}

/*
 * Look up the proc whose jobid and vpid match *name.
 * Returns the (non-retained) proc, or NULL when no entry matches.
 */
oshmem_proc_t * oshmem_proc_find(const orte_process_name_t * name)
{
    oshmem_proc_t *proc, *rproc = NULL;
    orte_ns_cmp_bitmask_t mask;

    /* return the proc-struct which matches this jobid+process id */
    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
         proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
         proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask,
                                                        (orte_process_name_t*)&proc->super.proc_name,
                                                        name)) {
            rproc = proc;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    return rproc;
}

/*
 * Pack name, arch and hostname of each of the proclistsize procs in proclist
 * into buf, in that order (the layout oshmem_proc_unpack reads back).
 * Returns OSHMEM_SUCCESS, or the first non-success code from opal_dss.pack.
 */
int oshmem_proc_pack(oshmem_proc_t **proclist,
                     int proclistsize,
                     opal_buffer_t* buf)
{
    int i, rc;

    OPAL_THREAD_LOCK(&oshmem_proc_lock);

    /* cycle through the provided array, packing the OSHMEM level
     * data for each proc. This data may or may not be included
     * in any subsequent modex operation, so we include it here
     * to ensure completion of a connect/accept handshake. See
     * the ompi/mca/dpm framework for an example of where and how
     * this info is used.
     *
     * Eventually, we will review the procedures that call this
     * function to see if duplication of communication can be
     * reduced. For now, just go ahead and pack the info so it
     * can be sent.
     */
    for (i = 0; i < proclistsize; i++) {
        rc = opal_dss.pack(buf, &(proclist[i]->super.proc_name), 1, ORTE_NAME);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
            return rc;
        }
        rc = opal_dss.pack(buf, &(proclist[i]->super.proc_arch), 1, OPAL_UINT32);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
            return rc;
        }
        rc = opal_dss.pack(buf, &(proclist[i]->super.proc_hostname), 1, OPAL_STRING);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
            return rc;
        }
    }
    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
    return OSHMEM_SUCCESS;
}

/*
 * Find the proc matching *name (jobid + vpid); when absent, create one,
 * append it to oshmem_proc_list and set only its name.
 * *isnew reports whether a new proc was requested. The return value is NULL
 * only when a new proc was needed and OBJ_NEW failed.
 */
static oshmem_proc_t *
oshmem_proc_find_and_add(const orte_process_name_t * name, bool* isnew)
{
    oshmem_proc_t *proc, *rproc = NULL;
    orte_ns_cmp_bitmask_t mask;

    /* return the proc-struct which matches this jobid+process id */
    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
         proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
         proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask,
                                                        (orte_process_name_t*)&proc->super.proc_name,
                                                        name)) {
            rproc = proc;
            *isnew = false;
            break;
        }
    }

    /* if we didn't find this proc in the list, create a new
     * proc_t and append it to the list
     */
    if (NULL == rproc) {
        *isnew = true;
        rproc = OBJ_NEW(oshmem_proc_t);
        if (NULL != rproc) {
            opal_list_append(&oshmem_proc_list, (opal_list_item_t*)rproc);
            rproc->super.proc_name = *(opal_process_name_t*)name;
        }
        /* caller had better fill in the rest of the proc, or there's going to be pain later... */
    }

    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    return rproc;
}

/*
 * Unpack proclistsize proc descriptions (name, arch, hostname) from buf.
 *
 * On success *proclist receives a calloc'ed array with one proc per unpacked
 * entry. If newproclistsize / newproclist are non-NULL they receive the count
 * and a calloc'ed array of the procs that were not previously on
 * oshmem_proc_list; when newproclist is NULL that array is freed here.
 *
 * Returns OSHMEM_SUCCESS, OSHMEM_ERR_OUT_OF_RESOURCE on allocation failure,
 * OSHMEM_ERR_NOT_SUPPORTED for a foreign arch without heterogeneous support,
 * or the failing opal_dss.unpack code.
 */
int oshmem_proc_unpack(opal_buffer_t* buf,
                       int proclistsize,
                       oshmem_proc_t ***proclist,
                       int *newproclistsize,
                       oshmem_proc_t ***newproclist)
{
    int i;
    size_t newprocs_len = 0;
    oshmem_proc_t **plist = NULL, **newprocs = NULL;

    /* do not free plist *ever*, since it is used in the remote group
     * structure of a communicator */
    plist = (oshmem_proc_t **) calloc(proclistsize, sizeof(oshmem_proc_t *));
    if (NULL == plist) {
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }
    /* free this on the way out */
    newprocs = (oshmem_proc_t **) calloc(proclistsize, sizeof(oshmem_proc_t *));
    if (NULL == newprocs) {
        free(plist);
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    /* cycle through the array of provided procs and unpack
     * their info - as packed by oshmem_proc_pack
     */
    for (i = 0; i < proclistsize; i++) {
        orte_std_cntr_t count = 1;
        orte_process_name_t new_name;
        uint32_t new_arch;
        char *new_hostname = NULL;
        bool isnew = false;
        int rc;

        rc = opal_dss.unpack(buf, &new_name, &count, ORTE_NAME);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            free(plist);
            free(newprocs);
            return rc;
        }
        rc = opal_dss.unpack(buf, &new_arch, &count, OPAL_UINT32);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            free(plist);
            free(newprocs);
            return rc;
        }
        rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            free(plist);
            free(newprocs);
            return rc;
        }

        /* see if this proc is already on our oshmem_proc_list */
        plist[i] = oshmem_proc_find_and_add(&new_name, &isnew);
        if (NULL == plist[i]) {
            /* a new proc was needed but could not be allocated */
            free(new_hostname);
            free(plist);
            free(newprocs);
            return OSHMEM_ERR_OUT_OF_RESOURCE;
        }
        if (!isnew) {
            /* known proc: the freshly unpacked hostname is not stored
             * anywhere, so release it instead of leaking it */
            free(new_hostname);
            continue;
        }

        /* the proc was just added, so update the values
         * in the proc_t struct with the info that was passed
         * to us
         */
        newprocs[newprocs_len++] = plist[i];

        /* update all the values */
        plist[i]->super.proc_arch = new_arch;
        /* if arch is different than mine, create a new convertor for this proc */
        if (plist[i]->super.proc_arch != opal_local_arch) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
            OBJ_RELEASE(plist[i]->super.proc_convertor);
            plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0);
#else
            orte_show_help("help-shmem-runtime.txt",
                           "heterogeneous-support-unavailable",
                           true,
                           orte_process_info.nodename,
                           new_hostname == NULL ? "" : new_hostname);
            free(new_hostname);
            free(plist);
            free(newprocs);
            return OSHMEM_ERR_NOT_SUPPORTED;
#endif
        }

        /* strcmp on a NULL pointer is undefined: only compare when both
         * hostnames are actually known */
        if (NULL != new_hostname
            && NULL != oshmem_proc_local_proc
            && NULL != oshmem_proc_local_proc->super.proc_hostname
            && 0 == strcmp(oshmem_proc_local_proc->super.proc_hostname,
                           new_hostname)) {
            plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE
                                           | OPAL_PROC_ON_CU
                                           | OPAL_PROC_ON_CLUSTER);
        }

        /* Save the hostname */
        plist[i]->super.proc_hostname = new_hostname;

        /* eventually, we will update the orte/mca/ess framework's data
         * to contain the info for the new proc. For now, we ignore
         * this step since the MPI layer already has all the info
         * it requires
         */
    }

    if (NULL != newproclistsize) {
        *newproclistsize = (int) newprocs_len;
    }
    if (NULL != newproclist) {
        *newproclist = newprocs;
    } else {
        free(newprocs);
    }

    *proclist = plist;
    return OSHMEM_SUCCESS;
}

/* Table of all live groups, indexed by group->id. */
opal_pointer_array_t oshmem_group_array = {{0}};

oshmem_group_t* oshmem_group_all = NULL;
oshmem_group_t* oshmem_group_self = NULL;
oshmem_group_t* oshmem_group_null = NULL;

OBJ_CLASS_INSTANCE(oshmem_group_t, opal_object_t, NULL, NULL);

/*
 * Construct the group table and create the predefined groups
 * (oshmem_group_all, oshmem_group_self; oshmem_group_null stays NULL).
 * Returns OSHMEM_SUCCESS or OSHMEM_ERROR.
 */
OSHMEM_DECLSPEC int oshmem_proc_group_init(void)
{
    /* Setup communicator array */
    OBJ_CONSTRUCT(&oshmem_group_array, opal_pointer_array_t);
    if (OPAL_SUCCESS != opal_pointer_array_init(&oshmem_group_array,
                                                0,
                                                ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                1)) {
        return OSHMEM_ERROR;
    }

    /* Setup SHMEM_GROUP_ALL */
    oshmem_group_all = oshmem_proc_group_create(0,
                                                1,
                                                opal_list_get_size(&oshmem_proc_list));
    if (NULL == oshmem_group_all) {
        /* nothing to destroy: creation returned NULL */
        return OSHMEM_ERROR;
    }

    /* Setup SHMEM_GROUP_SELF */
    oshmem_group_self = oshmem_proc_group_create(OSHMEM_PROC_VPID(oshmem_proc_local()),
                                                 0,
                                                 1);
    if (NULL == oshmem_group_self) {
        /* nothing to destroy: creation returned NULL */
        return OSHMEM_ERROR;
    }

    /* Setup SHMEM_GROUP_NULL */
    oshmem_group_null = NULL;

    return OSHMEM_SUCCESS;
}

/*
 * Destroy every group still registered in oshmem_group_array, then destruct
 * the array. Always returns OSHMEM_SUCCESS.
 */
OSHMEM_DECLSPEC int oshmem_proc_group_finalize(void)
{
    int max, i;
    oshmem_group_t *group;

    /* Check whether we have some left */
    max = opal_pointer_array_get_size(&oshmem_group_array);
    for (i = 0; i < max; i++) {
        group = (oshmem_group_t *) opal_pointer_array_get_item(&oshmem_group_array,
                                                               i);
        if (NULL != group) {
            /* Group has not been freed before finalize */
            oshmem_proc_group_destroy(group);
        }
    }

    OBJ_DESTRUCT(&oshmem_group_array);

    return OSHMEM_SUCCESS;
}

/*
 * Create a group of up to pe_size procs taken from oshmem_proc_list, starting
 * at list position pe_start and stepping by pe_stride (a stride of 0 accepts
 * every position from pe_start on). The group is registered in
 * oshmem_group_array and a collective module is selected for it.
 *
 * Returns the new group, or NULL on allocation failure or when no collective
 * module could be selected.
 */
OSHMEM_DECLSPEC oshmem_group_t* oshmem_proc_group_create(int pe_start,
                                                         int pe_stride,
                                                         size_t pe_size)
{
    int cur_pe, count_pe;
    int i;
    oshmem_group_t* group = NULL;
    oshmem_proc_t** proc_array = NULL;
    oshmem_proc_t* proc = NULL;

    group = OBJ_NEW(oshmem_group_t);
    if (NULL == group) {
        return NULL;
    }

    cur_pe = 0;
    count_pe = 0;

    OPAL_THREAD_LOCK(&oshmem_proc_lock);

    /* allocate an array */
    proc_array = (oshmem_proc_t**) malloc(pe_size * sizeof(oshmem_proc_t*));
    if (NULL == proc_array) {
        /* do not leak the group object allocated above */
        OBJ_RELEASE(group);
        OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
        return NULL ;
    }

    group->my_pe = OSHMEM_PROC_VPID(oshmem_proc_local());
    group->is_member = 0;

    /* now save only the procs selected by start/stride/size */
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
         proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
         proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (count_pe >= (int) pe_size) {
            break;
        } else if ((cur_pe >= pe_start)
                   && ((pe_stride == 0)
                       || (((cur_pe - pe_start) % pe_stride) == 0))) {
            proc_array[count_pe++] = proc;
            if (oshmem_proc_pe(proc) == group->my_pe) {
                group->is_member = 1;
            }
        }
        cur_pe++;
    }
    group->proc_array = proc_array;
    group->proc_count = (int) count_pe;
    group->ompi_comm = NULL;

    /* Prepare peers list */
    OBJ_CONSTRUCT(&(group->peer_list), opal_list_t);
    {
        orte_namelist_t *peer = NULL;

        for (i = 0; i < group->proc_count; i++) {
            peer = OBJ_NEW(orte_namelist_t);
            if (NULL == peer) {
                /* The group is neither registered nor scoll-selected yet,
                 * so undo only what has been built so far. */
                opal_list_item_t *item;
                while (NULL != (item = opal_list_remove_first(&(group->peer_list)))) {
                    OBJ_RELEASE(item);
                }
                OBJ_DESTRUCT(&(group->peer_list));
                free(proc_array);
                OBJ_RELEASE(group);
                OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
                return NULL ;
            }
            peer->name.jobid = OSHMEM_PROC_JOBID(group->proc_array[i]);
            peer->name.vpid = OSHMEM_PROC_VPID(group->proc_array[i]);
            opal_list_append(&(group->peer_list), &peer->super);
        }
    }
    group->id = opal_pointer_array_add(&oshmem_group_array, group);

    memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t));

    if (OSHMEM_SUCCESS != mca_scoll_base_select(group)) {
        opal_output(0,
                    "Error: No collective modules are available: group is not created, returning NULL");
        oshmem_proc_group_destroy(group);
        OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
        return NULL ;
    }
    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    return group;
}

/*
 * Destroy a group: unselect its collective module, free its proc array and
 * peer list, clear its slot in oshmem_group_array and release the object.
 * A NULL group is ignored.
 */
OSHMEM_DECLSPEC void oshmem_proc_group_destroy(oshmem_group_t* group)
{
    opal_list_item_t *item;

    if (NULL == group) {
        return;
    }

    mca_scoll_base_group_unselect(group);

    /* Destroy proc array (free(NULL) is a no-op) */
    free(group->proc_array);

    /* Destroy peer list */
    while (NULL != (item = opal_list_remove_first(&(group->peer_list)))) {
        /* destruct the item (we constructed it), then free the memory chunk */
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&(group->peer_list));

    /* reset the oshmem_group_array entry - make sure that the
     * entry is in the table */
    if (NULL != opal_pointer_array_get_item(&oshmem_group_array,
                                            group->id)) {
        opal_pointer_array_set_item(&oshmem_group_array, group->id, NULL );
    }

    OBJ_RELEASE(group);
}