From f690fc8fd5d1544655f69ea7364eba1df2d849ee Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 22 Oct 2015 15:50:40 -0600 Subject: [PATCH 1/2] osc/pt2pt: fix warnings Signed-off-by: Nathan Hjelm --- ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c | 11 ++++++----- ompi/mca/osc/pt2pt/osc_pt2pt_sync.c | 5 ++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c b/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c index 22f9d1f7c4..e169addb54 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c @@ -339,8 +339,7 @@ int ompi_osc_pt2pt_complete (ompi_win_t *win) for (size_t i = 0 ; i < group_size ; ++i) { ompi_osc_pt2pt_header_complete_t complete_req; int rank = peers[i]->rank; - ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, rank); - + if (my_rank == rank) { /* shortcut for self */ osc_pt2pt_incoming_complete (module, rank, 0); @@ -349,12 +348,14 @@ int ompi_osc_pt2pt_complete (ompi_win_t *win) complete_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE; complete_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID; -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG + complete_req.frag_count = module->epoch_outgoing_frag_count[rank]; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#if OPAL_ENABLE_DEBUG complete_req.padding[0] = 0; complete_req.padding[1] = 0; #endif - complete_req.frag_count = module->epoch_outgoing_frag_count[rank]; - osc_pt2pt_hton(&complete_req, proc); + osc_pt2pt_hton(&complete_req, ompi_comm_peer_lookup (module->comm, rank)); +#endif ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, rank); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c index 74c6be1801..7e28914801 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c @@ -34,7 +34,10 @@ OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_sync_t, opal_free_list_item_t, ompi_osc_pt2pt_sync_t *ompi_osc_pt2pt_sync_allocate (struct ompi_osc_pt2pt_module_t *module) { ompi_osc_pt2pt_sync_t *sync; -#pragma unused (module) + + /* module is not used yet */ + (void) module; + sync = OBJ_NEW (ompi_osc_pt2pt_sync_t); if (OPAL_UNLIKELY(NULL == sync)) { return NULL; From 63e744ffc66cc60fb25befc56b1c2d0a2150cce1 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 22 Oct 2015 15:51:19 -0600 Subject: [PATCH 2/2] osc/rdma: use only a single btl registration for local state This commit fixes a bug that can occur on Cray Gemini networks. If multiple registrations are used for the local state then we looks the atomicity guarantees. To avoid issues like this use only a single registration handle for all local state on a node. Signed-off-by: Nathan Hjelm --- ompi/mca/osc/rdma/osc_rdma_component.c | 45 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index ff9e8a1685..066e7777fc 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -326,7 +326,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void region->len = size; if (module->selected_btl->btl_register_mem && size) { - if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor) { + if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) { ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY, &module->base_handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -450,6 +450,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s size_t local_rank_array_size, leader_peer_data_size; int my_rank = ompi_comm_rank (module->comm); int global_size = ompi_comm_size (module->comm); + ompi_osc_rdma_region_t *state_region; int my_base_offset = 0; struct _local_data *temp; char *data_file; @@ -470,8 +471,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s leader_peer_data_size = module->region_size * module->node_count; /* calculate base offsets */ - module->state_offset = state_base = local_rank_array_size; - data_base = local_rank_array_size + leader_peer_data_size + module->state_size * local_size; + module->state_offset = state_base = local_rank_array_size + module->region_size; + data_base = state_base + leader_peer_data_size + module->state_size * local_size; do { temp = calloc (local_size, sizeof (temp[0])); @@ -533,12 +534,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s break; } - module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base; - if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { *base = (void *)((intptr_t) module->segment_base + my_base_offset); } + module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base; + /* put local state region data after the rank array */ + state_region = (ompi_osc_rdma_region_t *) ((uintptr_t) module->segment_base + local_rank_array_size); module->state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * local_rank); /* all local ranks share the array containing the peer data of leader ranks */ @@ -547,11 +549,18 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* initialize my state */ memset (module->state, 0, module->state_size); - /* just go ahead and register the whole segment */ - ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY, - &module->state_handle); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - break; + if (0 == local_rank) { + /* just go ahead and register the whole segment */ + ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, MCA_BTL_REG_FLAG_ACCESS_ANY, + &module->state_handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; + } + + state_region->base = (intptr_t) module->segment_base; + if (module->state_handle) { + memcpy (state_region->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size); + } } if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) { @@ -572,6 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s offset = data_base; for (int i = 0 ; i < local_size ; ++i) { ompi_osc_rdma_peer_extended_t *ex_peer; + ompi_osc_rdma_state_t *peer_state; ompi_osc_rdma_peer_t *peer; int peer_rank = temp[i].rank; @@ -582,21 +592,24 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; - peer->state = (osc_rdma_counter_t) ((uintptr_t) module->segment_base + state_base + module->state_size * i); + /* peer state local pointer */ + peer_state = (ompi_osc_rdma_state_t *) ((uintptr_t) module->segment_base + state_base + module->state_size * i); if (local_size == global_size || (module->selected_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB)) { /* all peers are local or it is safe to mix cpu and nic atomics */ peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; + peer->state = (osc_rdma_counter_t) peer_state; } else { /* use my endpoint handle to modify the peer's state */ - peer->state_handle = module->state_handle; - peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, my_rank); + if (module->selected_btl->btl_register_mem) { + peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data; + } + peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i); + peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank); } /* finish setting up the local peer structure */ if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) { - ompi_osc_rdma_state_t *peer_state = (ompi_osc_rdma_state_t *) (intptr_t) peer->state; - if (!module->same_disp_unit) { ex_peer->disp_unit = peer_state->disp_unit; } @@ -1050,8 +1063,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* calculate and store various structure sizes */ - /* the following two structures have similar usage but the later is meant to be a small as possible. they may - * be merged into a single structure in a later version of this component. */ module->region_size = module->selected_btl->btl_registration_handle_size + sizeof (ompi_osc_rdma_region_t); module->state_size = sizeof (ompi_osc_rdma_state_t);