osc/rdma: use only a single btl registration for local state
This commit fixes a bug that can occur on Cray Gemini networks. If multiple registrations are used for the local state then we lose the atomicity guarantees. To avoid issues like this, use only a single registration handle for all local state on a node. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
f690fc8fd5
Коммит
63e744ffc6
@ -326,7 +326,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
|
|||||||
region->len = size;
|
region->len = size;
|
||||||
|
|
||||||
if (module->selected_btl->btl_register_mem && size) {
|
if (module->selected_btl->btl_register_mem && size) {
|
||||||
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) {
|
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) {
|
||||||
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
|
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
|
||||||
&module->base_handle);
|
&module->base_handle);
|
||||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||||
@ -450,6 +450,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
size_t local_rank_array_size, leader_peer_data_size;
|
size_t local_rank_array_size, leader_peer_data_size;
|
||||||
int my_rank = ompi_comm_rank (module->comm);
|
int my_rank = ompi_comm_rank (module->comm);
|
||||||
int global_size = ompi_comm_size (module->comm);
|
int global_size = ompi_comm_size (module->comm);
|
||||||
|
ompi_osc_rdma_region_t *state_region;
|
||||||
int my_base_offset = 0;
|
int my_base_offset = 0;
|
||||||
struct _local_data *temp;
|
struct _local_data *temp;
|
||||||
char *data_file;
|
char *data_file;
|
||||||
@ -470,8 +471,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
leader_peer_data_size = module->region_size * module->node_count;
|
leader_peer_data_size = module->region_size * module->node_count;
|
||||||
|
|
||||||
/* calculate base offsets */
|
/* calculate base offsets */
|
||||||
module->state_offset = state_base = local_rank_array_size;
|
module->state_offset = state_base = local_rank_array_size + module->region_size;
|
||||||
data_base = local_rank_array_size + leader_peer_data_size + module->state_size * local_size;
|
data_base = state_base + leader_peer_data_size + module->state_size * local_size;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
temp = calloc (local_size, sizeof (temp[0]));
|
temp = calloc (local_size, sizeof (temp[0]));
|
||||||
@ -533,12 +534,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
|
|
||||||
|
|
||||||
if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
|
if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
|
||||||
*base = (void *)((intptr_t) module->segment_base + my_base_offset);
|
*base = (void *)((intptr_t) module->segment_base + my_base_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
|
||||||
|
/* put local state region data after the rank array */
|
||||||
|
state_region = (ompi_osc_rdma_region_t *) ((uintptr_t) module->segment_base + local_rank_array_size);
|
||||||
module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank);
|
module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank);
|
||||||
|
|
||||||
/* all local ranks share the array containing the peer data of leader ranks */
|
/* all local ranks share the array containing the peer data of leader ranks */
|
||||||
@ -547,6 +549,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
/* initialize my state */
|
/* initialize my state */
|
||||||
memset (module->state, 0, module->state_size);
|
memset (module->state, 0, module->state_size);
|
||||||
|
|
||||||
|
if (0 == local_rank) {
|
||||||
/* just go ahead and register the whole segment */
|
/* just go ahead and register the whole segment */
|
||||||
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
|
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
|
||||||
&module->state_handle);
|
&module->state_handle);
|
||||||
@ -554,6 +557,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
state_region->base = (intptr_t) module->segment_base;
|
||||||
|
if (module->state_handle) {
|
||||||
|
memcpy (state_region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
|
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
|
||||||
ret = ompi_osc_rdma_initialize_region (module, base, size);
|
ret = ompi_osc_rdma_initialize_region (module, base, size);
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
@ -572,6 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
offset = data_base;
|
offset = data_base;
|
||||||
for (int i = 0 ; i < local_size ; ++i) {
|
for (int i = 0 ; i < local_size ; ++i) {
|
||||||
ompi_osc_rdma_peer_extended_t *ex_peer;
|
ompi_osc_rdma_peer_extended_t *ex_peer;
|
||||||
|
ompi_osc_rdma_state_t *peer_state;
|
||||||
ompi_osc_rdma_peer_t *peer;
|
ompi_osc_rdma_peer_t *peer;
|
||||||
int peer_rank = temp[i].rank;
|
int peer_rank = temp[i].rank;
|
||||||
|
|
||||||
@ -582,21 +592,24 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
|
|||||||
|
|
||||||
ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
|
ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
|
||||||
|
|
||||||
peer->state = (osc_rdma_counter_t) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
|
/* peer state local pointer */
|
||||||
|
peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
|
||||||
|
|
||||||
if (local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB)) {
|
if (local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB)) {
|
||||||
/* all peers are local or it is safe to mix cpu and nic atomics */
|
/* all peers are local or it is safe to mix cpu and nic atomics */
|
||||||
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
|
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
|
||||||
|
peer->state = (osc_rdma_counter_t) peer_state;
|
||||||
} else {
|
} else {
|
||||||
/* use my endpoint handle to modify the peer's state */
|
/* use my endpoint handle to modify the peer's state */
|
||||||
peer->state_handle = module->state_handle;
|
if (module->selected_btl->btl_register_mem) {
|
||||||
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, my_rank);
|
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
|
||||||
|
}
|
||||||
|
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
|
||||||
|
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* finish setting up the local peer structure */
|
/* finish setting up the local peer structure */
|
||||||
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
|
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
|
||||||
ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) (intptr_t) peer->state;
|
|
||||||
|
|
||||||
if (!module->same_disp_unit) {
|
if (!module->same_disp_unit) {
|
||||||
ex_peer->disp_unit = peer_state->disp_unit;
|
ex_peer->disp_unit = peer_state->disp_unit;
|
||||||
}
|
}
|
||||||
@ -1050,8 +1063,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
|
|||||||
|
|
||||||
/* calculate and store various structure sizes */
|
/* calculate and store various structure sizes */
|
||||||
|
|
||||||
/* the following two structures have similar usage but the later is meant to be a small as possible. they may
|
|
||||||
* be merged into a single structure in a later version of this component. */
|
|
||||||
module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t);
|
module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t);
|
||||||
|
|
||||||
module->state_size = sizeof (ompi_osc_rdma_state_t);
|
module->state_size = sizeof (ompi_osc_rdma_state_t);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user