Fix segv in btl/vader.
Keep track of the already-connected procs across calls to vader_add_procs(). Otherwise, the same rank will reconnect to the same shmem segment (rank 0+...) multiple times instead of connecting to the next one as intended. Signed-off-by: Austen Lauria <awlauria@us.ibm.com>
Этот коммит содержится в:
родитель
96559fee77
Коммит
f69c8d6819
@ -125,6 +125,7 @@ struct mca_btl_vader_component_t {
|
||||
char *my_segment; /**< this rank's base pointer */
|
||||
size_t segment_size; /**< size of my_segment */
|
||||
int32_t num_smp_procs; /**< current number of smp procs on this host */
|
||||
int32_t local_rank; /**< current rank index at add_procs() time */
|
||||
opal_free_list_t vader_frags_eager; /**< free list of vader send frags */
|
||||
opal_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */
|
||||
opal_free_list_t vader_frags_user; /**< free list of small inline frags */
|
||||
|
@ -560,6 +560,8 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
|
||||
/* no fast boxes allocated initially */
|
||||
component->num_fbox_in_endpoints = 0;
|
||||
|
||||
component->local_rank = 0;
|
||||
|
||||
mca_btl_vader_check_single_copy ();
|
||||
|
||||
if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) {
|
||||
|
@ -349,7 +349,7 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t proc = 0, local_rank = 0 ; proc < (int32_t) nprocs ; ++proc) {
|
||||
for (int32_t proc = 0; proc < (int32_t) nprocs; ++proc) {
|
||||
/* check to see if this proc can be reached via shmem (i.e.,
|
||||
if they're on my local host and in my job) */
|
||||
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
|
||||
@ -367,8 +367,10 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
/* setup endpoint */
|
||||
peers[proc] = component->endpoints + local_rank;
|
||||
rc = init_vader_endpoint (peers[proc], procs[proc], local_rank++);
|
||||
int rank = opal_atomic_fetch_add_32(&component -> local_rank, 1);
|
||||
|
||||
peers[proc] = component->endpoints + rank;
|
||||
rc = init_vader_endpoint (peers[proc], procs[proc], rank);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
break;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user