Merge pull request #3181 from artpol84/add_proc_fix_2/master
ompi: Avoid unnecessary PMIx lookups when adding procs (step 2).
Этот коммит содержится в:
Коммит
45b46dc446
102
ompi/proc/proc.c
102
ompi/proc/proc.c
@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
|
||||
opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
|
||||
proc);
|
||||
|
||||
/* by default we consider the process to be remote */
|
||||
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
|
||||
*procp = proc;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
|
||||
*/
|
||||
int ompi_proc_complete_init_single (ompi_proc_t *proc)
|
||||
{
|
||||
uint16_t u16, *u16ptr;
|
||||
int ret;
|
||||
|
||||
u16ptr = &u16;
|
||||
|
||||
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
|
||||
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
|
||||
/* nothing else to do */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* get the locality information - all RTEs are required
|
||||
* to provide this information at startup */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
|
||||
} else {
|
||||
proc->super.proc_flags = u16;
|
||||
}
|
||||
|
||||
/* we can retrieve the hostname at no cost because it
|
||||
* was provided at startup - but make it optional so
|
||||
* we don't chase after it if some system doesn't
|
||||
@ -287,20 +277,6 @@ int ompi_proc_init(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
|
||||
/* create proc structures and find self */
|
||||
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
|
||||
if (i == OMPI_PROC_MY_NAME->vpid) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -329,11 +305,44 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
|
||||
*/
|
||||
int ompi_proc_complete_init(void)
|
||||
{
|
||||
opal_process_name_t wildcard_rank;
|
||||
ompi_proc_t *proc;
|
||||
int ret, errcode = OMPI_SUCCESS;
|
||||
char *val;
|
||||
|
||||
opal_mutex_lock (&ompi_proc_lock);
|
||||
|
||||
/* Add all local peers first */
|
||||
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
|
||||
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
|
||||
/* retrieve the local peers */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
|
||||
&wildcard_rank, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
char **peers = opal_argv_split(val, ',');
|
||||
int i;
|
||||
free(val);
|
||||
for (i=0; NULL != peers[i]; i++) {
|
||||
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
|
||||
uint16_t u16, *u16ptr = &u16;
|
||||
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
|
||||
continue;
|
||||
}
|
||||
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
/* get the locality information - all RTEs are required
|
||||
* to provide this information at startup */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
|
||||
if (OPAL_SUCCESS == ret) {
|
||||
proc->super.proc_flags = u16;
|
||||
}
|
||||
}
|
||||
opal_argv_free(peers);
|
||||
}
|
||||
|
||||
/* Complete initialization of node-local procs */
|
||||
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
|
||||
ret = ompi_proc_complete_init_single (proc);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
@ -341,35 +350,32 @@ int ompi_proc_complete_init(void)
|
||||
break;
|
||||
}
|
||||
}
|
||||
opal_mutex_unlock (&ompi_proc_lock);
|
||||
|
||||
if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
|
||||
char *val = NULL;
|
||||
opal_process_name_t wildcard_rank;
|
||||
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
|
||||
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
|
||||
/* retrieve the local peers */
|
||||
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
|
||||
&wildcard_rank, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
char **peers = opal_argv_split(val, ',');
|
||||
int i;
|
||||
free(val);
|
||||
for (i=0; NULL != peers[i]; i++) {
|
||||
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
|
||||
opal_process_name_t proc_name = {.vpid = local_rank, .jobid = OMPI_PROC_MY_NAME->jobid};
|
||||
/* if cutoff is larger than # of procs - add all processes
|
||||
* NOTE that local procs will be automatically skipped as they
|
||||
* are already in the hash table
|
||||
*/
|
||||
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
|
||||
/* since ompi_proc_for_name is locking internally -
|
||||
* we need to release lock here
|
||||
*/
|
||||
opal_mutex_unlock (&ompi_proc_lock);
|
||||
|
||||
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
|
||||
continue;
|
||||
}
|
||||
(void) ompi_proc_for_name (proc_name);
|
||||
}
|
||||
opal_argv_free(peers);
|
||||
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
|
||||
opal_process_name_t proc_name;
|
||||
proc_name.jobid = OMPI_PROC_MY_NAME->jobid;
|
||||
proc_name.vpid = i;
|
||||
(void) ompi_proc_for_name (proc_name);
|
||||
}
|
||||
|
||||
/* re-acquire the lock for the next step - sort */
|
||||
opal_mutex_lock (&ompi_proc_lock);
|
||||
}
|
||||
|
||||
opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);
|
||||
|
||||
opal_mutex_unlock (&ompi_proc_lock);
|
||||
|
||||
return errcode;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user