
ompi/proc: add proc hash table for ompi_proc_t objects

This commit adds an opal hash table to keep track of the mapping
between process identifiers and ompi_proc_t's. This hash table is used
by the ompi_proc_for_name() function to look up (in O(1) time) a given
process. This can be used by a BTL or other component to get an
ompi_proc_t when handling an incoming message from an as-yet-unknown
peer.
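
As an illustrative aside (not part of this commit), the lookup a BTL
might perform when a message arrives from a previously unseen peer
could look roughly like the sketch below. The helper name
handle_unknown_sender is hypothetical; opal_proc_for_name() is the OPAL
hook this commit points at ompi_proc_for_name(), and opal/util/proc.h
plus opal/constants.h are assumed to be included.

    /* Hypothetical receive-path helper: resolve a sender seen for the
     * first time. opal_proc_for_name() consults the new hash table and
     * returns the cached ompi_proc_t in O(1) time, allocating and
     * initializing a new one on first use. */
    static int handle_unknown_sender (opal_process_name_t sender)
    {
        opal_proc_t *proc = opal_proc_for_name (sender);
        if (NULL == proc) {
            /* allocation or initialization failed */
            return OPAL_ERR_NOT_FOUND;
        }

        /* proc is now in the global process list and hash table and
         * can be used to set up an endpoint for this peer */
        return OPAL_SUCCESS;
    }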

Additionally, this commit adds a new MCA variable to control the new
add_procs behavior: mpi_add_procs_cutoff. If the number of ranks in the
job falls below the threshold, an ompi_proc_t is created for every
process. If the number of ranks is at or above the threshold, an
ompi_proc_t is created only for the local rank. The code needed to
generate additional ompi_proc_t's for a communicator is not yet
complete.
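
As a usage note, the cutoff behaves like any other MCA variable and can
be overridden at run time, e.g. "mpirun --mca mpi_add_procs_cutoff 0"
to exercise the on-demand path even in small jobs, or a larger value to
keep the old create-everything-up-front behavior.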

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
Nathan Hjelm 2015-05-04 16:11:34 -06:00
parent 6f8f2325ed
commit 408da16d50
12 changed files with 396 additions and 184 deletions

View file

@@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
@@ -10,7 +11,7 @@
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014 Research Organization for Information Science
@@ -43,6 +44,8 @@
static opal_list_t ompi_proc_list;
static opal_mutex_t ompi_proc_lock;
static opal_hash_table_t ompi_proc_hash;
ompi_proc_t* ompi_proc_local_proc = NULL;
static void ompi_proc_construct(ompi_proc_t* proc);
@@ -83,49 +86,223 @@ void ompi_proc_destruct(ompi_proc_t* proc)
}
OPAL_THREAD_LOCK(&ompi_proc_lock);
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name));
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
}
/**
* Allocate a new ompi_proc_t for the given jobid/vpid
*
* @param[in] jobid Job identifier
* @param[in] vpid Process identifier
* @param[out] procp New ompi_proc_t structure
*
* This function allocates a new ompi_proc_t and inserts it into
* the process list and hash table.
*/
static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t **procp) {
ompi_proc_t *proc = OBJ_NEW(ompi_proc_t);
opal_list_append(&ompi_proc_list, (opal_list_item_t*)proc);
OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = jobid;
OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid = vpid;
opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
proc);
*procp = proc;
return OMPI_SUCCESS;
}
/**
* Finish setting up an ompi_proc_t
*
* @param[in] proc ompi process structure
*
* This function contains the core code of ompi_proc_complete_init() and
* ompi_proc_refresh(). The tasks performed by this function include
* retrieving the hostname (if below the modex cutoff), determining the
* remote architecture, and calculating the locality of the process.
*/
static int ompi_proc_complete_init_single (ompi_proc_t *proc)
{
uint16_t u16, *u16ptr;
int ret;
u16ptr = &u16;
if (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid) {
/* nothing else to do */
return OMPI_SUCCESS;
}
/* get the locality information - all RTEs are required
* to provide this information at startup */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS != ret) {
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
} else {
proc->super.proc_flags = u16;
}
/* we can retrieve the hostname at no cost because it
* was provided at startup */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->super.proc_name,
(char**)&(proc->super.proc_hostname), OPAL_STRING);
if (OPAL_SUCCESS != ret) {
return ret;
}
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* get the remote architecture - this might force a modex except
* for those environments where the RM provides it */
{
uint32_t *ui32ptr;
ui32ptr = &(proc->super.proc_arch);
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_ARCH, &proc->super.proc_name,
(void**)&ui32ptr, OPAL_UINT32);
if (OPAL_SUCCESS == ret) {
/* if arch is different than mine, create a new convertor for this proc */
if (proc->super.proc_arch != opal_local_arch) {
OBJ_RELEASE(proc->super.proc_convertor);
proc->super.proc_convertor = opal_convertor_create(proc->super.proc_arch, 0);
}
} else if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
proc->super.proc_arch = opal_local_arch;
} else {
return ret;
}
}
#else
/* must be same arch as my own */
proc->super.proc_arch = opal_local_arch;
#endif
return OMPI_SUCCESS;
}
opal_proc_t *ompi_proc_lookup (const opal_process_name_t proc_name)
{
ompi_proc_t *proc = NULL;
int ret;
/* try to lookup the value in the hash table */
ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc);
if (OPAL_SUCCESS == ret) {
return &proc->super;
}
return NULL;
}
opal_proc_t *ompi_proc_for_name (const opal_process_name_t proc_name)
{
ompi_proc_t *proc = NULL;
int ret;
/* try to lookup the value in the hash table */
ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc);
if (OPAL_SUCCESS == ret) {
return &proc->super;
}
OPAL_THREAD_LOCK(&ompi_proc_lock);
do {
/* double-check that another competing thread has not added this proc */
ret = opal_hash_table_get_value_ptr (&ompi_proc_hash, &proc_name, sizeof (proc_name), (void **) &proc);
if (OPAL_SUCCESS == ret) {
break;
}
/* allocate a new ompi_proc_t object for the process and insert it into the process table */
ret = ompi_proc_allocate (proc_name.jobid, proc_name.vpid, &proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* allocation failed */
break;
}
/* finish filling in the important proc data fields */
ret = ompi_proc_complete_init_single (proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
proc = NULL;
break;
}
} while (0);
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
return (opal_proc_t *) proc;
}
int ompi_proc_init(void)
{
int opal_proc_hash_init_size = (ompi_process_info.num_procs < ompi_add_procs_cutoff) ? ompi_process_info.num_procs :
1024;
ompi_proc_t *proc;
int ret;
OBJ_CONSTRUCT(&ompi_proc_list, opal_list_t);
OBJ_CONSTRUCT(&ompi_proc_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ompi_proc_hash, opal_hash_table_t);
ret = opal_hash_table_init (&ompi_proc_hash, opal_proc_hash_init_size);
if (OPAL_SUCCESS != ret) {
return ret;
}
/* create a proc for the local process */
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, OMPI_PROC_MY_NAME->vpid, &proc);
if (OMPI_SUCCESS != ret) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* set local process data */
ompi_proc_local_proc = proc;
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
proc->super.proc_hostname = strdup(ompi_process_info.nodename);
proc->super.proc_arch = opal_local_arch;
/* Register the local proc with OPAL */
opal_proc_local_set(&proc->super);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
/* add our arch to the modex */
OPAL_MODEX_SEND_VALUE(ret, PMIX_GLOBAL,
OPAL_PMIX_ARCH, &opal_local_arch, OPAL_UINT32);
if (OPAL_SUCCESS != ret) {
return ret;
}
#endif
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
/* create proc structures and find self */
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i) {
if (i == OMPI_PROC_MY_NAME->vpid) {
continue;
}
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
}
return OMPI_SUCCESS;
}
static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
{
ompi_proc_t *proca = (ompi_proc_t *) *a;
ompi_proc_t *procb = (ompi_proc_t *) *b;
if (proca->super.proc_name.vpid > procb->super.proc_name.vpid) {
return 1;
} else {
return -1;
}
/* they should never be equal */
}
/**
* The process creation is split into two steps. The second step
@@ -140,58 +317,47 @@ int ompi_proc_complete_init(void)
{
ompi_proc_t *proc;
int ret, errcode = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&ompi_proc_lock);
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
ret = ompi_proc_complete_init_single (proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
errcode = ret;
break;
}
}
OPAL_THREAD_UNLOCK(&ompi_proc_lock);
if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
uint16_t u16, *u16ptr;
u16ptr = &u16;
/* find and add all local processes */
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i) {
opal_process_name_t proc_name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid};
uint16_t locality = OPAL_PROC_NON_LOCAL;
if (OMPI_PROC_MY_NAME->vpid == i) {
continue;
}
/* the runtime is required to fill in locality for all local processes by this
* point. only local processes will have locality set */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCALITY, &proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS == ret) {
locality = u16;
}
if (OPAL_PROC_NON_LOCAL != locality) {
(void) ompi_proc_for_name (proc_name);
}
}
}
opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);
return errcode;
}
@@ -227,6 +393,7 @@ int ompi_proc_finalize (void)
/* now destruct the list and thread lock */
OBJ_DESTRUCT(&ompi_proc_list);
OBJ_DESTRUCT(&ompi_proc_lock);
OBJ_DESTRUCT(&ompi_proc_hash);
return OMPI_SUCCESS;
}
@@ -248,9 +415,7 @@ ompi_proc_t** ompi_proc_world(size_t *size)
/* First count how many match this jobid */
OPAL_THREAD_LOCK(&ompi_proc_lock);
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, OMPI_CAST_RTE_NAME(&proc->super.proc_name), &my_name)) {
++count;
}
@@ -265,9 +430,7 @@ ompi_proc_t** ompi_proc_world(size_t *size)
/* now save only the procs that match this jobid */
count = 0;
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, &my_name)) {
/* DO NOT RETAIN THIS OBJECT - the reference count on this
* object will be adjusted by external callers. The intent
@@ -305,9 +468,7 @@ ompi_proc_t** ompi_proc_all(size_t* size)
}
OPAL_THREAD_LOCK(&ompi_proc_lock);
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
/* We know this isn't consistent with the behavior in ompi_proc_world,
* but we are leaving the RETAIN for now because the code using this function
* assumes that the results need to be released when done. It will
@@ -349,9 +510,7 @@ ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name )
/* return the proc-struct which matches this jobid+process id */
mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID;
OPAL_THREAD_LOCK(&ompi_proc_lock);
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, name)) {
rproc = proc;
break;
@@ -366,21 +525,14 @@ ompi_proc_t * ompi_proc_find ( const ompi_process_name_t * name )
int ompi_proc_refresh(void)
{
ompi_proc_t *proc = NULL;
int ret=OMPI_SUCCESS;
OPAL_THREAD_LOCK(&ompi_proc_lock);
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
/* Does not change: proc->super.proc_name.vpid */
OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid = OMPI_PROC_MY_NAME->jobid;
/* Make sure to clear the local flag before we set it below */
proc->super.proc_flags = 0;
@@ -392,56 +544,10 @@ int ompi_proc_refresh(void)
proc->super.proc_arch = opal_local_arch;
opal_proc_local_set(&proc->super);
} else {
ret = ompi_proc_complete_init_single (proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
break;
}
}
}
@@ -454,7 +560,7 @@ int
ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
opal_buffer_t* buf)
{
int rc;
OPAL_THREAD_LOCK(&ompi_proc_lock);
@@ -470,7 +576,7 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
* reduced. For now, just go ahead and pack the info so it
* can be sent.
*/
for (int i = 0 ; i < proclistsize ; ++i) {
rc = opal_dss.pack(buf, &(proclist[i]->super.proc_name), 1, OMPI_NAME);
if(rc != OPAL_SUCCESS) {
OMPI_ERROR_LOG(rc);
@@ -503,9 +609,7 @@ ompi_proc_find_and_add(const ompi_process_name_t * name, bool* isnew)
/* return the proc-struct which matches this jobid+process id */
mask = OMPI_RTE_CMP_JOBID | OMPI_RTE_CMP_VPID;
OPAL_THREAD_LOCK(&ompi_proc_lock);
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, &proc->super.proc_name, name)) {
rproc = proc;
*isnew = false;
@@ -538,7 +642,6 @@ ompi_proc_unpack(opal_buffer_t* buf,
int proclistsize, ompi_proc_t ***proclist,
int *newproclistsize, ompi_proc_t ***newproclist)
{
size_t newprocs_len = 0;
ompi_proc_t **plist=NULL, **newprocs = NULL;
@@ -558,7 +661,7 @@ ompi_proc_unpack(opal_buffer_t* buf,
/* cycle through the array of provided procs and unpack
* their info - as packed by ompi_proc_pack
*/
for (int i = 0 ; i < proclistsize ; ++i) {
int32_t count=1;
ompi_process_name_t new_name;
uint32_t new_arch;

View file

@@ -304,6 +304,25 @@ OMPI_DECLSPEC int ompi_proc_unpack(opal_buffer_t *buf,
*/
OMPI_DECLSPEC int ompi_proc_refresh(void);
/**
* Get the ompi_proc_t for a given process name
*
* @param[in] proc_name opal process name
*
* @returns cached or new ompi_proc_t for the given process name
*
* This function looks up the given process name in the hash of existing
* ompi_proc_t structures. If no ompi_proc_t structure exists matching the
* given name, a new ompi_proc_t is allocated, initialized, and returned.
*
* @note The ompi_proc_t is added to the local list of processes but is not
* added to any communicator. ompi_comm_peer_lookup is responsible for caching
* the ompi_proc_t on a communicator.
*/
OMPI_DECLSPEC opal_proc_t *ompi_proc_for_name (const opal_process_name_t proc_name);
OMPI_DECLSPEC opal_proc_t *ompi_proc_lookup (const opal_process_name_t proc_name);
END_C_DECLS
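
To make the intended division of labor concrete, here is a minimal,
illustrative fragment (the surrounding code is hypothetical; only the
two functions come from this header). ompi_proc_lookup() is a read-only
probe of the hash table, while ompi_proc_for_name() creates the process
object on demand:

    /* probe only: returns NULL if this peer has never been seen and
     * never allocates anything */
    opal_proc_t *proc = ompi_proc_lookup (name);
    if (NULL == proc) {
        /* create on demand: allocates the ompi_proc_t, adds it to the
         * list and hash table, and runs the single-proc init
         * (locality, hostname, arch) */
        proc = ompi_proc_for_name (name);
    }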

View file

@@ -400,6 +400,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
opal_compare_proc = _process_name_compare;
opal_convert_string_to_process_name = _convert_string_to_process_name;
opal_convert_process_name_to_string = _convert_process_name_to_string;
opal_proc_for_name = ompi_proc_for_name;
/* Register MCA variables */
if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) {

View file

@@ -64,6 +64,7 @@ int ompi_mpi_event_tick_rate = -1;
char *ompi_mpi_show_mca_params_string = NULL;
bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
bool ompi_mpi_preconnect_mpi = false;
uint32_t ompi_add_procs_cutoff = 1024;
static bool show_default_mca_params = false;
static bool show_file_mca_params = false;
@@ -288,6 +289,16 @@ int ompi_mpi_register_params(void)
ompi_rte_abort(1, NULL);
}
ompi_add_procs_cutoff = 1024;
(void) mca_base_var_register ("ompi", "mpi", NULL, "add_procs_cutoff",
"Maximum world size for pre-allocating resources for all "
"remote processes. Increasing this limit may improve "
"communication performance at the cost of memory usage "
"(default: 1024)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&ompi_add_procs_cutoff);
return OMPI_SUCCESS;
}

View file

@@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@@ -9,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
@@ -123,11 +124,16 @@ OMPI_DECLSPEC extern bool ompi_have_sparse_group_storage;
*/
OMPI_DECLSPEC extern bool ompi_use_sparse_group_storage;
/**
* Cutoff point for retrieving hostnames
*/
OMPI_DECLSPEC extern uint32_t ompi_direct_modex_cutoff;
/**
* Cutoff point for calling add_procs for all processes
*/
OMPI_DECLSPEC extern uint32_t ompi_add_procs_cutoff;
/**
* Register MCA parameters used by the MPI layer.
*

View file

@@ -49,7 +49,7 @@
/* ompi and smsg endpoint attributes */
typedef struct mca_btl_ugni_endpoint_attr_t {
opal_process_name_t proc_name;
uint32_t index;
gni_smsg_attr_t smsg_attr;
gni_mem_handle_t rmt_irq_mem_hndl;
@@ -67,6 +67,7 @@ typedef struct mca_btl_ugni_module_t {
opal_common_ugni_device_t *device;
opal_mutex_t endpoint_lock;
size_t endpoint_count;
opal_pointer_array_t endpoints;
opal_hash_table_t id_to_endpoint;
@@ -229,6 +230,8 @@ mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl,
struct opal_proc_t **procs,
struct mca_btl_base_endpoint_t **peers);
struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc);
/**
* Initiate an asynchronous send.
*

View file

@@ -28,13 +28,11 @@ static void
mca_btl_ugni_module_set_max_reg (mca_btl_ugni_module_t *ugni_module, int nlocal_procs);
static int mca_btl_ugni_smsg_setup (int nprocs);
int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs,
struct opal_proc_t **procs,
struct mca_btl_base_endpoint_t **peers,
opal_bitmap_t *reachable) {
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
int rc;
void *mmap_start_addr;
@@ -59,36 +57,45 @@ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t* btl,
}
}
for (size_t i = 0 ; i < nprocs ; ++i) {
struct opal_proc_t *opal_proc = procs[i];
uint64_t proc_id = mca_btl_ugni_proc_name_to_id(opal_proc->proc_name);
/* check for an existing endpoint */
OPAL_THREAD_LOCK(&ugni_module->endpoint_lock);
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) (peers + i))) {
if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) {
ugni_module->nlocal_procs++;
/* ugni is allowed on local processes to provide support for network
* atomic operations */
}
/* Create and Init endpoints */
rc = mca_btl_ugni_init_ep (ugni_module, peers + i, (mca_btl_ugni_module_t *) btl, opal_proc);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);
BTL_ERROR(("btl/ugni error initializing endpoint"));
return rc;
}
/* go ahead and connect the local endpoint for RDMA/CQ write */
if (opal_proc == opal_proc_local_get ()) {
ugni_module->local_ep = peers[i];
}
/* Add this endpoint to the pointer array. */
BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) peers[i]));
opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, peers[i]);
++ugni_module->endpoint_count;
}
OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);
/* Set the reachable bit if necessary */
if (reachable) {
rc = opal_bitmap_set_bit (reachable, i);
}
}
mca_btl_ugni_module_set_max_reg (ugni_module, ugni_module->nlocal_procs);
@@ -224,6 +231,41 @@ int mca_btl_ugni_del_procs (struct mca_btl_base_module_t *btl,
return OPAL_SUCCESS;
}
struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc)
{
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) module;
uint64_t proc_id = mca_btl_ugni_proc_name_to_id(proc->proc_name);
mca_btl_base_endpoint_t *ep = NULL;
int rc;
OPAL_THREAD_LOCK(&ugni_module->endpoint_lock);
do {
rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep);
if (OPAL_SUCCESS == rc) {
/* endpoint already exists for this peer; the lock is released once,
* after the do/while */
break;
}
/* Create and Init endpoints */
rc = mca_btl_ugni_init_ep (ugni_module, &ep, ugni_module, proc);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
BTL_ERROR(("btl/ugni error initializing endpoint"));
ep = NULL;
break;
}
/* Add this endpoint to the pointer array. */
BTL_VERBOSE(("initialized uGNI endpoint for proc id: 0x%" PRIx64 " ptr: %p", proc_id, (void *) ep));
opal_hash_table_set_value_uint64 (&ugni_module->id_to_endpoint, proc_id, ep);
} while (0);
OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);
return ep;
}
static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size,
mca_mpool_base_registration_t *reg)
{

View file

@@ -386,8 +386,8 @@ mca_btl_ugni_component_init (int *num_btl_modules,
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
uint64_t datagram_id, data;
mca_btl_base_endpoint_t *ep;
gni_post_state_t post_state;
gni_ep_handle_t handle;
@@ -425,15 +425,24 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
/* if this is a wildcard endpoint lookup the remote peer by the proc id we received */
if (handle == ugni_module->wildcard_ep) {
BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64, ugni_module->wc_remote_attr.proc_id));
rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint,
ugni_module->wc_remote_attr.proc_id,
(void *) &ep);
proc_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name);
BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64,
proc_id));
OPAL_THREAD_LOCK(&ugni_module->endpoint_lock);
rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep);
OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);
/* check if the endpoint is known */
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) {
BTL_ERROR(("received connection attempt from an unknown peer. rc: %d, ep: %p, id: 0x%" PRIx64,
rc, (void *) ep, ugni_module->wc_remote_attr.proc_id));
return OPAL_ERR_NOT_FOUND;
struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vid = 0x%x}",
ugni_module->wc_remote_attr.proc_name.jobid, ugni_module->wc_remote_attr.proc_name.vpid));
ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
if (OPAL_UNLIKELY(NULL == ep)) {
return rc;
}
}
} else {
BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));

View file

@@ -91,6 +91,7 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t);
OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock,opal_mutex_t);
OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t);
OBJ_CONSTRUCT(&ugni_module->endpoint_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);
@@ -208,6 +209,7 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
OBJ_DESTRUCT(&ugni_module->smsg_mboxes);
OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb);
OBJ_DESTRUCT(&ugni_module->id_to_endpoint);
OBJ_DESTRUCT(&ugni_module->endpoint_lock);
OBJ_DESTRUCT(&ugni_module->endpoints);
OBJ_DESTRUCT(&ugni_module->eager_get_pending);

View file

@@ -27,7 +27,7 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) {
mbox->attr.smsg_attr.msg_buffer = base_reg->base;
mbox->attr.smsg_attr.buff_size = mca_btl_ugni_component.smsg_mbox_size;
mbox->attr.smsg_attr.mem_hndl = ugni_reg->handle.gni_handle;
mbox->attr.proc_name = OPAL_PROC_MY_NAME;
mbox->attr.rmt_irq_mem_hndl = mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;
}

View file

@@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
@@ -6,6 +7,8 @@
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -162,6 +165,11 @@ static int opal_convert_string_to_jobid_should_never_be_called(opal_jobid_t *job
return OPAL_ERR_NOT_SUPPORTED;
}
static struct opal_proc_t *opal_proc_for_name_should_never_be_called (opal_process_name_t name)
{
return NULL;
}
char* (*opal_process_name_print)(const opal_process_name_t) = opal_process_name_print_should_never_be_called;
char* (*opal_vpid_print)(const opal_vpid_t) = opal_vpid_print_should_never_be_called;
char* (*opal_jobid_print)(const opal_jobid_t) = opal_jobid_print_should_never_be_called;
@@ -169,6 +177,7 @@ int (*opal_convert_string_to_process_name)(opal_process_name_t *name, const char
int (*opal_convert_process_name_to_string)(char** name_string, const opal_process_name_t *name) = opal_convert_process_name_to_string_should_never_be_called;
char* (*opal_convert_jobid_to_string)(opal_jobid_t jobid) = opal_convert_jobid_to_string_should_never_be_called;
int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string) = opal_convert_string_to_jobid_should_never_be_called;
struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name) = opal_proc_for_name_should_never_be_called;
char* opal_get_proc_hostname(const opal_proc_t *proc)
{

View file

@@ -136,6 +136,13 @@ OPAL_DECLSPEC extern char* (*opal_jobid_print)(const opal_jobid_t);
OPAL_DECLSPEC extern char* (*opal_convert_jobid_to_string)(opal_jobid_t jobid);
OPAL_DECLSPEC extern int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_string);
/**
* Lookup an opal_proc_t by name
*
* @param name (IN) name to lookup
*/
OPAL_DECLSPEC extern struct opal_proc_t *(*opal_proc_for_name) (const opal_process_name_t name);
#define OPAL_NAME_PRINT(OPAL_PN) opal_process_name_print(OPAL_PN)
#define OPAL_JOBID_PRINT(OPAL_PN) opal_jobid_print(OPAL_PN)
#define OPAL_VPID_PRINT(OPAL_PN) opal_vpid_print(OPAL_PN)