
osc/rdma: use only a single btl registration for local state

This commit fixes a bug that can occur on Cray Gemini networks. If
multiple registrations are used for the local state then we lose the
atomicity guarantees. To avoid issues like this, use only a single
registration handle for all local state on a node.
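The pattern the patch adopts in allocate_state_shared() boils down to the
sketch below: the lowest local rank registers the whole shared segment once
and publishes the handle bytes inside the segment itself, so every other
local rank reuses that one handle instead of creating its own registration.
btl_handle_t, btl_register(), and the sizes here are hypothetical stand-ins
for the BTL btl_register_mem machinery, not Open MPI symbols.

/* A minimal, self-contained sketch of the single-registration pattern this
 * commit introduces.  btl_handle_t and btl_register() are hypothetical
 * stand-ins for the BTL's btl_register_mem interface, not Open MPI symbols. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint8_t opaque[64]; } btl_handle_t;             /* fake NIC key */
typedef struct { intptr_t base; uint8_t handle_data[64]; } region_t;

/* mock registration: a real BTL would pin [addr, addr+len) here */
static int btl_register (void *addr, size_t len, btl_handle_t *handle)
{
    memset (handle->opaque, 0xa5, sizeof (handle->opaque));
    (void) addr; (void) len;
    return 0;
}

/* the leader (lowest local rank) registers the whole shared segment once and
 * publishes the handle inside the segment; peers only ever read it back */
static int publish_state_region (void *segment, size_t total_size,
                                 region_t *state_region, int local_rank)
{
    if (0 == local_rank) {
        btl_handle_t handle;

        if (0 != btl_register (segment, total_size, &handle)) {
            return -1;
        }

        state_region->base = (intptr_t) segment;
        memcpy (state_region->handle_data, &handle, sizeof (handle));
    }

    /* no per-rank registration: every remote atomic on any rank's state
     * goes through this one mapping, so the NIC's atomicity holds */
    return 0;
}

int main (void)
{
    static uint8_t segment[4096];
    region_t *state_region = (region_t *) segment;

    publish_state_region (segment, sizeof (segment), state_region, 0);
    printf ("registered base: %p\n", (void *) state_region->base);
    return 0;
}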

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
Nathan Hjelm 2015-10-22 15:51:19 -06:00
parent f690fc8fd5
commit 63e744ffc6


@@ -326,7 +326,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
     region->len = size;
 
     if (module->selected_btl->btl_register_mem && size) {
-        if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) {
+        if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) {
             ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
                                           &module->base_handle);
             if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -450,6 +450,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
     size_t local_rank_array_size, leader_peer_data_size;
     int my_rank = ompi_comm_rank (module->comm);
     int global_size = ompi_comm_size (module->comm);
+    ompi_osc_rdma_region_t *state_region;
     int my_base_offset = 0;
     struct _local_data *temp;
     char *data_file;
@@ -470,8 +471,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
     leader_peer_data_size = module->region_size * module->node_count;
 
     /* calculate base offsets */
-    module->state_offset = state_base = local_rank_array_size;
-    data_base = local_rank_array_size + leader_peer_data_size + module->state_size * local_size;
+    module->state_offset = state_base = local_rank_array_size + module->region_size;
+    data_base = state_base + leader_peer_data_size + module->state_size * local_size;
 
     do {
         temp = calloc (local_size, sizeof (temp[0]));
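Spelled out, the new offset computation above is equivalent to this small
sketch; the function and its parameters are illustrative, while the module
fields in the hunk are the authoritative source:

#include <stddef.h>

/* the state region record is squeezed in between the rank array and the
 * per-rank states, so state_base moves up by region_size and data_base is
 * expressed relative to the new state_base */
static void compute_offsets (size_t local_rank_array_size, size_t region_size,
                             size_t leader_peer_data_size, size_t state_size,
                             int local_size,
                             size_t *state_base, size_t *data_base)
{
    *state_base = local_rank_array_size + region_size;
    *data_base  = *state_base + leader_peer_data_size
                + state_size * (size_t) local_size;
}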
@@ -533,12 +534,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             break;
         }
 
-        module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
-
         if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
             *base = (void *)((intptr_t) module->segment_base + my_base_offset);
         }
 
+        module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;
+        /* put local state region data after the rank array */
+        state_region = (ompi_osc_rdma_region_t *) ((uintptr_t) module->segment_base + local_rank_array_size);
         module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank);
 
         /* all local ranks share the array containing the peer data of leader ranks */
@@ -547,11 +549,18 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
         /* initialize my state */
         memset (module->state, 0, module->state_size);
 
-        /* just go ahead and register the whole segment */
-        ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
-                                      &module->state_handle);
-        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
-            break;
+        if (0 == local_rank) {
+            /* just go ahead and register the whole segment */
+            ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY,
+                                          &module->state_handle);
+            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+                break;
+            }
+
+            state_region->base = (intptr_t) module->segment_base;
+            if (module->state_handle) {
+                memcpy (state_region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
+            }
         }
 
         if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
@@ -572,6 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
         offset = data_base;
 
         for (int i = 0 ; i < local_size ; ++i) {
             ompi_osc_rdma_peer_extended_t *ex_peer;
+            ompi_osc_rdma_state_t *peer_state;
             ompi_osc_rdma_peer_t *peer;
             int peer_rank = temp[i].rank;
@@ -582,21 +592,24 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
 
             ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
 
-            peer->state = (osc_rdma_counter_t) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
+            /* peer state local pointer */
+            peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i);
 
             if (local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB)) {
                 /* all peers are local or it is safe to mix cpu and nic atomics */
                 peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
+                peer->state = (osc_rdma_counter_t) peer_state;
             } else {
                 /* use my endpoint handle to modify the peer's state */
-                peer->state_handle = module->state_handle;
-                peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, my_rank);
+                if (module->selected_btl->btl_register_mem) {
+                    peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
+                }
+                peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
+                peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
             }
 
             /* finish setting up the local peer structure */
             if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
-                ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) (intptr_t) peer->state;
-
                 if (!module->same_disp_unit) {
                     ex_peer->disp_unit = peer_state->disp_unit;
                 }
@@ -1050,8 +1063,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
 
     /* calculate and store various structure sizes */
-    /* the following two structures have similar usage but the later is meant to be a small as possible. they may
-     * be merged into a single structure in a later version of this component. */
     module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t);
     module->state_size = sizeof (ompi_osc_rdma_state_t);
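Finally, the peer-side address math from the last allocate_state_shared hunk
reduces to a fixed stride inside the leader's single registration. A minimal
sketch, assuming the layout this commit sets up (the helper name is invented
for illustration):

#include <stdint.h>
#include <stddef.h>

/* remote target address of local peer i's state: region_base is the leader's
 * segment base as published in state_region->base, state_base the offset
 * computed in allocate_state_shared */
static inline intptr_t peer_state_target (intptr_t region_base, size_t state_base,
                                          size_t state_size, int i)
{
    /* one registration handle plus this offset reaches any local rank */
    return region_base + (intptr_t) (state_base + state_size * (size_t) i);
}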