1
1
Keep track of the already-connected procs across calls to
vader_add_procs(). Otherwise, every call restarts at local rank 0,
so the same rank reconnects the same shmem segment (rank 0+...)
multiple times instead of moving on to the next one as intended.

Signed-off-by: Austen Lauria <awlauria@us.ibm.com>
(cherry picked from commit f69c8d6819)
Этот коммит содержится в:
Austen Lauria 2020-03-03 13:31:56 -05:00
родитель e00fc61dcf
Коммит f7979fbc82
3 изменённых файла: 8 добавлений и 3 удаления

Просмотреть файл

@@ -124,6 +124,7 @@ struct mca_btl_vader_component_t {
char *my_segment; /**< this rank's base pointer */
size_t segment_size; /**< size of my_segment */
int32_t num_smp_procs; /**< current number of smp procs on this host */
+int32_t local_rank; /**< current rank index at add_procs() time */
opal_free_list_t vader_frags_eager; /**< free list of vader send frags */
opal_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */
opal_free_list_t vader_frags_user; /**< free list of small inline frags */

Просмотреть файл

@@ -559,6 +559,8 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
/* no fast boxes allocated initially */
component->num_fbox_in_endpoints = 0;
+component->local_rank = 0;
mca_btl_vader_check_single_copy ();
if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) {

Просмотреть файл

@@ -349,7 +349,7 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
}
}
-for (int32_t proc = 0, local_rank = 0 ; proc < (int32_t) nprocs ; ++proc) {
+for (int32_t proc = 0; proc < (int32_t) nprocs; ++proc) {
/* check to see if this proc can be reached via shmem (i.e.,
if they're on my local host and in my job) */
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
@@ -367,8 +367,10 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
}
/* setup endpoint */
-peers[proc] = component->endpoints + local_rank;
-rc = init_vader_endpoint (peers[proc], procs[proc], local_rank++);
+int rank = opal_atomic_fetch_add_32(&component -> local_rank, 1);
+peers[proc] = component->endpoints + rank;
+rc = init_vader_endpoint (peers[proc], procs[proc], rank);
if (OPAL_SUCCESS != rc) {
break;
}