1
1

Merge pull request #3181 from artpol84/add_proc_fix_2/master

ompi: Avoid unnecessary PMIx lookups when adding procs (step 2).
Этот коммит содержится в:
Ralph Castain 2017-03-16 15:06:08 -07:00 коммит произвёл GitHub
родитель 5219054d29 1f7a3a2d54
Коммит 45b46dc446

Просмотреть файл

@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
proc);
/* by default we consider process to be remote */
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
*procp = proc;
return OMPI_SUCCESS;
@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
*/
int ompi_proc_complete_init_single (ompi_proc_t *proc)
{
uint16_t u16, *u16ptr;
int ret;
u16ptr = &u16;
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
/* nothing else to do */
return OMPI_SUCCESS;
}
/* get the locality information - all RTEs are required
* to provide this information at startup */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS != ret) {
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
} else {
proc->super.proc_flags = u16;
}
/* we can retrieve the hostname at no cost because it
* was provided at startup - but make it optional so
* we don't chase after it if some system doesn't
@ -287,20 +277,6 @@ int ompi_proc_init(void)
}
#endif
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
/* create proc structures and find self */
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
if (i == OMPI_PROC_MY_NAME->vpid) {
continue;
}
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
}
return OMPI_SUCCESS;
}
@ -329,11 +305,44 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
*/
int ompi_proc_complete_init(void)
{
opal_process_name_t wildcard_rank;
ompi_proc_t *proc;
int ret, errcode = OMPI_SUCCESS;
char *val;
opal_mutex_lock (&ompi_proc_lock);
/* Add all local peers first */
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
char **peers = opal_argv_split(val, ',');
int i;
free(val);
for (i=0; NULL != peers[i]; i++) {
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
uint16_t u16, *u16ptr = &u16;
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
continue;
}
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* get the locality information - all RTEs are required
* to provide this information at startup */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS == ret) {
proc->super.proc_flags = u16;
}
}
opal_argv_free(peers);
}
/* Complete initialization of node-local procs */
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
ret = ompi_proc_complete_init_single (proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@ -341,35 +350,32 @@ int ompi_proc_complete_init(void)
break;
}
}
opal_mutex_unlock (&ompi_proc_lock);
if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
char *val = NULL;
opal_process_name_t wildcard_rank;
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
char **peers = opal_argv_split(val, ',');
int i;
free(val);
for (i=0; NULL != peers[i]; i++) {
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
opal_process_name_t proc_name = {.vpid = local_rank, .jobid = OMPI_PROC_MY_NAME->jobid};
/* if cutoff is larger than # of procs - add all processes
* NOTE that local procs will be automatically skipped as they
* are already in the hash table
*/
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
/* since ompi_proc_for_name locks internally,
* we need to release the lock here
*/
opal_mutex_unlock (&ompi_proc_lock);
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
continue;
}
(void) ompi_proc_for_name (proc_name);
}
opal_argv_free(peers);
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
opal_process_name_t proc_name;
proc_name.jobid = OMPI_PROC_MY_NAME->jobid;
proc_name.vpid = i;
(void) ompi_proc_for_name (proc_name);
}
/* acquire lock back for the next step - sort */
opal_mutex_lock (&ompi_proc_lock);
}
opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);
opal_mutex_unlock (&ompi_proc_lock);
return errcode;
}