From f5c721d11c64405b0ccee1ce8d6dec9ba65c3ae5 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 2 Jul 2007 22:22:59 +0000 Subject: [PATCH] Wire up all the RDMA-capable BTLs. Still no RDMA communication, but the datastructures are finally all there This commit was SVN r15271. --- ompi/mca/osc/rdma/osc_rdma.c | 52 ++- ompi/mca/osc/rdma/osc_rdma.h | 51 +++ ompi/mca/osc/rdma/osc_rdma_component.c | 463 ++++++++++++++++++++++++- ompi/mca/osc/rdma/osc_rdma_header.h | 55 ++- 4 files changed, 598 insertions(+), 23 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.c b/ompi/mca/osc/rdma/osc_rdma.c index 2ccab8aaca..62701b2e0f 100644 --- a/ompi/mca/osc/rdma/osc_rdma.c +++ b/ompi/mca/osc/rdma/osc_rdma.c @@ -34,7 +34,7 @@ int ompi_osc_rdma_module_free(ompi_win_t *win) { int ret = OMPI_SUCCESS; - int tmp; + int tmp, i; ompi_osc_rdma_module_t *module = GET_MODULE(win); opal_output_verbose(1, ompi_osc_base_output, @@ -92,9 +92,57 @@ ompi_osc_rdma_module_free(ompi_win_t *win) if (NULL != module->m_num_pending_sendreqs) { free(module->m_num_pending_sendreqs); } + if (NULL != module->m_peer_info) { + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]); + } + free(module->m_peer_info); + } if (NULL != module->m_comm) ompi_comm_free(&module->m_comm); - if (NULL != module) free(module); return ret; } + + +int +ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info) +{ + int i; + + if (NULL != peer_info->peer_btls) { + free(peer_info->peer_btls); + } + + if (NULL != peer_info->local_descriptors) { + for (i = 0 ; i < peer_info->local_num_btls ; ++i) { + if (NULL != peer_info->local_descriptors[i]) { + mca_bml_base_btl_t *bml_btl = + peer_info->local_btls[i]; + bml_btl->btl_free(bml_btl->btl, + peer_info->local_descriptors[i]); + } + } + free(peer_info->local_descriptors); + } + + if (NULL != peer_info->local_registrations) { + for (i = 0 ; i < peer_info->local_num_btls ; ++i) { + if (NULL != peer_info->local_registrations[i]) { + mca_mpool_base_module_t *module = + peer_info->local_registrations[i]->mpool; + module->mpool_deregister(module, + peer_info->local_registrations[i]); + } + } + free(peer_info->local_registrations); + } + + if (NULL != peer_info->local_btls) { + free(peer_info->local_btls); + } + + memset(peer_info, 0, sizeof(ompi_osc_rdma_peer_info_t)); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index a9b6ba0242..ab72132290 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -27,6 +27,7 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/osc/osc.h" #include "ompi/mca/btl/btl.h" +#include "ompi/mca/bml/bml.h" BEGIN_C_DECLS @@ -67,14 +68,51 @@ struct ompi_osc_rdma_component_t { #endif bool c_btl_registered; + + uint32_t c_sequence_number; }; typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t; +struct ompi_osc_rdma_btl_t { + uint64_t peer_seg_key; + mca_bml_base_btl_t *bml_btl; + int rdma_order; +}; +typedef struct ompi_osc_rdma_btl_t ompi_osc_rdma_btl_t; + + +struct ompi_osc_rdma_peer_info_t { + uint64_t peer_base; + uint64_t peer_len; + + int peer_num_btls; + volatile int peer_index_btls; + ompi_osc_rdma_btl_t *peer_btls; + + int local_num_btls; + mca_bml_base_btl_t **local_btls; + mca_mpool_base_registration_t **local_registrations; + mca_btl_base_descriptor_t **local_descriptors; +}; +typedef struct ompi_osc_rdma_peer_info_t ompi_osc_rdma_peer_info_t; + + +struct ompi_osc_rdma_setup_info_t { + volatile int32_t 
num_btls_callin; + int32_t num_btls_expected; + volatile int32_t num_btls_outgoing; + opal_list_t *outstanding_btl_requests; +}; +typedef struct ompi_osc_rdma_setup_info_t ompi_osc_rdma_setup_info_t; + + struct ompi_osc_rdma_module_t { /** Extend the basic osc module interface */ ompi_osc_base_module_t super; + uint32_t m_sequence_number; + /** lock access to data structures in the current module */ opal_mutex_t m_lock; @@ -136,6 +174,15 @@ struct ompi_osc_rdma_module_t { bool m_eager_send_active; bool m_eager_send_ok; + /* RDMA data */ + bool m_use_rdma; + ompi_osc_rdma_setup_info_t *m_setup_info; + ompi_osc_rdma_peer_info_t *m_peer_info; + + int32_t m_num_pending_rdma; + + volatile int32_t m_num_complete_rdma; + /* ********************* FENCE data ************************ */ /* an array of ints, each containing the value 1. */ @@ -157,6 +204,7 @@ struct ompi_osc_rdma_module_t { typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t; OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component; + #define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module) /* @@ -178,6 +226,8 @@ int ompi_osc_rdma_component_select(struct ompi_win_t *win, int ompi_osc_rdma_component_progress(void); +int ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info); + /* * Module interface function types */ @@ -248,6 +298,7 @@ int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module, int ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module); + END_C_DECLS #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index b9a1bea277..1ba8e7497d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -47,6 +47,7 @@ static void component_fragment_cb(struct mca_btl_base_module_t *btl, #if OMPI_ENABLE_PROGRESS_THREADS static void* component_thread_fn(opal_object_t *obj); #endif +static int setup_rdma(ompi_osc_rdma_module_t *module); ompi_osc_rdma_component_t mca_osc_rdma_component = { { /* ompi_osc_base_component_t */ @@ -143,6 +144,12 @@ component_open(void) "Info key of same name overrides this value.", false, false, 1, NULL); + mca_base_param_reg_int(&mca_osc_rdma_component.super.osc_version, + "use_rdma", + "Use real RDMA operations to transfer data. " + "Info key of same name overrides this value.", + false, false, 0, NULL); + mca_base_param_reg_int(&mca_osc_rdma_component.super.osc_version, "no_locks", "Enable optimizations available only if MPI_LOCK is " @@ -204,6 +211,8 @@ ompi_osc_rdma_component_init(bool enable_progress_threads, mca_osc_rdma_component.c_btl_registered = false; + mca_osc_rdma_component.c_sequence_number = 0; + return OMPI_SUCCESS; } @@ -292,6 +301,10 @@ ompi_osc_rdma_component_select(ompi_win_t *win, module->m_win = win; + OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); + module->m_sequence_number = (mca_osc_rdma_component.c_sequence_number++); + OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); + ret = ompi_comm_dup(comm, &module->m_comm, 0); if (ret != OMPI_SUCCESS) goto cleanup; @@ -329,6 +342,11 @@ ompi_osc_rdma_component_select(ompi_win_t *win, they start their epochs, so this isn't a problem. 
*/ module->m_eager_send_active = module->m_eager_send_ok; + /* allocate space for rdma information */ + module->m_use_rdma = check_config_value_bool("use_rdma", info); + module->m_setup_info = NULL; + module->m_peer_info = NULL; + /* fence data */ module->m_fence_coll_counts = (int*) malloc(sizeof(int) * ompi_comm_size(module->m_comm)); @@ -388,9 +406,6 @@ ompi_osc_rdma_component_select(ompi_win_t *win, win->w_flags |= OMPI_WIN_NO_LOCKS; } - /* sync memory - make sure all initialization completed */ - opal_atomic_mb(); - /* register to receive fragment callbacks, if not already done */ OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); if (!mca_osc_rdma_component.c_btl_registered) { @@ -402,9 +417,18 @@ ompi_osc_rdma_component_select(ompi_win_t *win, OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); if (OMPI_SUCCESS != ret) goto cleanup; - /* need to make create a collective, or lock requests can come in - before the window is fully created... */ - module->m_comm->c_coll.coll_barrier(module->m_comm); + /* sync memory - make sure all initialization completed */ + opal_atomic_mb(); + + if (module->m_use_rdma) { + /* fill in rdma information - involves barrier semantics */ + ret = setup_rdma(module); + } else { + /* barrier to prevent arrival of lock requests before we're + fully created */ + ret = module->m_comm->c_coll.coll_barrier(module->m_comm); + } + if (OMPI_SUCCESS != ret) goto cleanup; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "done creating window %d", module->m_comm->c_contextid)); @@ -436,8 +460,13 @@ ompi_osc_rdma_component_select(ompi_win_t *win, if (NULL != module->m_num_pending_sendreqs) { free(module->m_num_pending_sendreqs); } + if (NULL != module->m_peer_info) { + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]); + } + free(module->m_peer_info); + } if (NULL != module->m_comm) ompi_comm_free(&module->m_comm); - if (NULL != module) free(module); return ret; @@ -766,6 +795,55 @@ component_fragment_cb(struct mca_btl_base_module_t *btl, } break; + case OMPI_OSC_RDMA_HDR_RDMA_INFO: + { + ompi_osc_rdma_rdma_info_header_t *header = + (ompi_osc_rdma_rdma_info_header_t*) + descriptor->des_dst[0].seg_addr.pval; + ompi_proc_t *proc = NULL; + mca_bml_base_endpoint_t *endpoint = NULL; + mca_bml_base_btl_t *bml_btl; + ompi_osc_rdma_btl_t *rdma_btl; + int origin, index; + +#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT + if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { + OMPI_OSC_RDMA_RDMA_INFO_HDR_NTOH(*header); + } +#endif + + /* get our module pointer */ + module = ompi_osc_rdma_windx_to_module(header->hdr_windx); + if (NULL == module) return; + + origin = header->hdr_origin; + + /* find the bml_btl */ + proc = ompi_comm_peer_lookup(module->m_comm, origin); + endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + bml_btl = mca_bml_base_btl_array_find(&endpoint->btl_rdma, btl); + if (NULL == bml_btl) { + opal_output(ompi_osc_base_output, + "received rdma info for unknown btl from rank %d", + origin); + return; + } + + OPAL_THREAD_LOCK(&module->m_lock); + index = module->m_peer_info[origin].peer_num_btls++; + rdma_btl = &(module->m_peer_info[origin].peer_btls[index]); + + rdma_btl->peer_seg_key = header->hdr_segkey; + rdma_btl->bml_btl = bml_btl; + rdma_btl->rdma_order = MCA_BTL_NO_ORDER; + + module->m_setup_info->num_btls_callin++; + OPAL_THREAD_UNLOCK(&module->m_lock); + + opal_condition_broadcast(&module->m_cond); + } + break; + default: /* BWB - FIX ME - this sucks */ 
opal_output(ompi_osc_base_output, @@ -844,3 +922,374 @@ component_thread_fn(opal_object_t *obj) return NULL; } #endif + + +/*********** RDMA setup stuff ***********/ + + +struct peer_rdma_send_info_t{ + opal_list_item_t super; + ompi_osc_rdma_module_t *module; + ompi_proc_t *proc; + mca_bml_base_btl_t *bml_btl; + uint64_t seg_key; +}; +typedef struct peer_rdma_send_info_t peer_rdma_send_info_t; +OBJ_CLASS_INSTANCE(peer_rdma_send_info_t, opal_list_item_t, NULL, NULL); + + +static void +rdma_send_info_send_complete(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t *endpoint, + struct mca_btl_base_descriptor_t* descriptor, + int status) +{ + peer_rdma_send_info_t *peer_send_info = + (peer_rdma_send_info_t*) descriptor->des_cbdata; + + if (OMPI_SUCCESS == status) { + btl->btl_free(btl, descriptor); + + OPAL_THREAD_LOCK(&peer_send_info->module->m_lock); + peer_send_info->module->m_setup_info->num_btls_outgoing--; + OPAL_THREAD_UNLOCK(&peer_send_info->module->m_lock); + + opal_condition_broadcast(&(peer_send_info->module->m_cond)); + + OBJ_RELEASE(peer_send_info); + } else { + /* BWB - fix me */ + abort(); + } +} + +static int +rdma_send_info_send(ompi_osc_rdma_module_t *module, + peer_rdma_send_info_t *peer_send_info) +{ + int ret = OMPI_SUCCESS; + mca_bml_base_btl_t *bml_btl = NULL; + mca_btl_base_descriptor_t *descriptor = NULL; + ompi_osc_rdma_rdma_info_header_t *header = NULL; + + bml_btl = peer_send_info->bml_btl; + descriptor = bml_btl->btl_alloc(bml_btl->btl, + MCA_BTL_NO_ORDER, + sizeof(ompi_osc_rdma_rdma_info_header_t)); + if (NULL == descriptor) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + + /* verify at least enough space for header */ + if (descriptor->des_src[0].seg_len < sizeof(ompi_osc_rdma_rdma_info_header_t)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + /* setup descriptor */ + descriptor->des_cbfunc = rdma_send_info_send_complete; + descriptor->des_cbdata = peer_send_info; + descriptor->des_flags = MCA_BTL_DES_FLAGS_PRIORITY; + descriptor->des_src[0].seg_len = sizeof(ompi_osc_rdma_rdma_info_header_t); + + /* pack header */ + header = (ompi_osc_rdma_rdma_info_header_t*) descriptor->des_src[0].seg_addr.pval; + header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_RDMA_INFO; + header->hdr_segkey = peer_send_info->seg_key; + header->hdr_origin = ompi_comm_rank(module->m_comm); + header->hdr_windx = module->m_comm->c_contextid; + +#ifdef WORDS_BIGENDIAN + header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; +#elif OMPI_ENABLE_HETEROGENEOUS_SUPPORT + if (peer_send_info->proc->proc_arch & OMPI_ARCH_ISBIGENDIAN) { + header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; + OMPI_OSC_RDMA_RDMA_INFO_HDR_HTON(*header); + } +#endif + + /* send fragment */ + ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA); + goto done; + + cleanup: + if (descriptor != NULL) { + mca_bml_base_free(bml_btl, descriptor); + } + + done: + return ret; +} + + +static bool +is_valid_rdma(mca_bml_base_btl_t *bml_btl) +{ + if (((bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) != 0) && + ((bml_btl->btl_flags & MCA_BTL_FLAGS_GET) != 0)) { + return true; + } + + return false; +} + + +static int +setup_rdma(ompi_osc_rdma_module_t *module) +{ + + uint64_t local; + uint64_t *remote = NULL; + MPI_Datatype ui64_type; + int ret = OMPI_SUCCESS; + int i; + +#if SIZEOF_LONG == 8 + ui64_type = MPI_LONG; +#else + ui64_type = MPI_LONG_LONG; +#endif + + /* create a setup info structure */ + module->m_setup_info = malloc(sizeof(ompi_osc_rdma_setup_info_t)); + if (NULL == 
module->m_setup_info) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + module->m_setup_info->num_btls_callin = 0; + module->m_setup_info->num_btls_expected = -1; + module->m_setup_info->num_btls_outgoing = 0; + module->m_setup_info->outstanding_btl_requests = + malloc(sizeof(opal_list_t) * ompi_comm_size(module->m_comm)); + if (NULL == module->m_setup_info->outstanding_btl_requests) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + OBJ_CONSTRUCT(&(module->m_setup_info->outstanding_btl_requests[i]), + opal_list_t); + } + + /* create peer info array */ + module->m_peer_info = (ompi_osc_rdma_peer_info_t*) + malloc(sizeof(ompi_osc_rdma_peer_info_t) * + ompi_comm_size(module->m_comm)); + if (NULL == module->m_peer_info) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + memset(module->m_peer_info, 0, + sizeof(ompi_osc_rdma_peer_info_t) * ompi_comm_size(module->m_comm)); + + /* get number of btls to each peer, descriptors for the window for + each peer */ + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + ompi_proc_t *proc = ompi_comm_peer_lookup(module->m_comm, i); + ompi_osc_rdma_peer_info_t *peer_info = &module->m_peer_info[i]; + mca_bml_base_endpoint_t *endpoint = + (mca_bml_base_endpoint_t*) proc->proc_bml; + int num_avail = + mca_bml_base_btl_array_get_size(&endpoint->btl_rdma); + size_t j, size; + ompi_convertor_t convertor; + + /* skip peer if heterogeneous */ + if (ompi_proc_local()->proc_arch != proc->proc_arch) { + continue; + } + + /* get a rough estimation of how many BTLs we'll be able to + use, and exit if the answer is none */ + for (j = 0 ; + j < mca_bml_base_btl_array_get_size(&endpoint->btl_rdma) ; + ++j) { + mca_bml_base_btl_t *bml_btl = + mca_bml_base_btl_array_get_index(&endpoint->btl_rdma, j); + if (!is_valid_rdma(bml_btl)) num_avail--; + } + if (0 == num_avail) continue; + + /* Allocate space for all the useable BTLs. They might not + all end up useable, if we can't pin memory for the btl or + the like. 
But the number of elements to start with should
+           be small and the number that fail the pin test should be
+           approximately 0, so this isn't too big of a waste */
+        peer_info->peer_btls = (ompi_osc_rdma_btl_t*)
+            malloc(sizeof(ompi_osc_rdma_btl_t) * num_avail);
+        peer_info->local_btls = (mca_bml_base_btl_t**)
+            malloc(sizeof(mca_bml_base_btl_t*) * num_avail);
+        peer_info->local_registrations = (mca_mpool_base_registration_t**)
+            malloc(sizeof(mca_mpool_base_registration_t*) * num_avail);
+        peer_info->local_descriptors = (mca_btl_base_descriptor_t**)
+            malloc(sizeof(mca_btl_base_descriptor_t*) * num_avail);
+        if (NULL == peer_info->peer_btls ||
+            NULL == peer_info->local_btls ||
+            NULL == peer_info->local_registrations ||
+            NULL == peer_info->local_descriptors) {
+            ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+        memset(peer_info->peer_btls, 0,
+               sizeof(ompi_osc_rdma_btl_t) * num_avail);
+        memset(peer_info->local_registrations, 0,
+               sizeof(mca_mpool_base_registration_t*) * num_avail);
+        memset(peer_info->local_descriptors, 0,
+               sizeof(mca_btl_base_descriptor_t*) * num_avail);
+
+        OBJ_CONSTRUCT(&convertor, ompi_convertor_t);
+
+        /* Find all useable btls, try to do the descriptor thing for
+           them, and store all that information */
+        for (j = 0 ;
+             j < mca_bml_base_btl_array_get_size(&endpoint->btl_rdma) ;
+             ++j) {
+            mca_bml_base_btl_t *bml_btl =
+                mca_bml_base_btl_array_get_index(&endpoint->btl_rdma, j);
+            mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;
+            int index = peer_info->local_num_btls;
+
+            if (!is_valid_rdma(bml_btl)) continue;
+
+            if (NULL != btl_mpool) {
+                ret = btl_mpool->mpool_register(btl_mpool, module->m_win->w_baseptr,
+                                                module->m_win->w_size, 0,
+                                                &(peer_info->local_registrations[index]));
+                if (OMPI_SUCCESS != ret) continue;
+            } else {
+                peer_info->local_registrations[index] = NULL;
+            }
+
+            size = module->m_win->w_size;
+
+            ompi_convertor_copy_and_prepare_for_send(proc->proc_convertor,
+                                                     MPI_BYTE,
+                                                     module->m_win->w_size,
+                                                     module->m_win->w_baseptr,
+                                                     0,
+                                                     &convertor);
+
+            peer_info->local_descriptors[index] =
+                bml_btl->btl_prepare_dst(bml_btl->btl,
+                                         bml_btl->btl_endpoint,
+                                         peer_info->local_registrations[index],
+                                         &convertor,
+                                         MCA_BTL_NO_ORDER,
+                                         0,
+                                         &size);
+            if (NULL == peer_info->local_descriptors[index]) {
+                if (NULL != peer_info->local_registrations[index]) {
+                    btl_mpool->mpool_deregister(btl_mpool,
+                                                peer_info->local_registrations[index]);
+                }
+                ompi_convertor_cleanup(&convertor);
+                continue;
+            }
+
+            peer_info->local_btls[index] = bml_btl;
+
+            ompi_convertor_cleanup(&convertor);
+
+            peer_info->local_num_btls++;
+            module->m_setup_info->num_btls_outgoing++;
+        }
+
+        OBJ_DESTRUCT(&convertor);
+    }
+
+    /* fill in information about remote peers */
+    remote = malloc(sizeof(uint64_t) * ompi_comm_size(module->m_comm));
+    if (NULL == remote) {
+        ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
+        goto cleanup;
+    }
+
+    local = ompi_ptr_ptol(module->m_win->w_baseptr);
+    ret = module->m_comm->c_coll.coll_allgather(&local, 1, ui64_type,
+                                                remote, 1, ui64_type,
+                                                module->m_comm);
+    if (OMPI_SUCCESS != ret) goto cleanup;
+    for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
+        module->m_peer_info[i].peer_base = remote[i];
+    }
+
+    local = module->m_win->w_size;
+    ret = module->m_comm->c_coll.coll_allgather(&local, 1, ui64_type,
+                                                remote, 1, ui64_type,
+                                                module->m_comm);
+    if (OMPI_SUCCESS != ret) goto cleanup;
+    for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) {
+        module->m_peer_info[i].peer_len = remote[i];
+    }
+
+    /* get number of btls we're expecting from everyone 
*/ + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + remote[i] = module->m_peer_info[i].local_num_btls; + } + ret = module->m_comm->c_coll.coll_reduce_scatter(remote, + &local, + module->m_fence_coll_counts, + ui64_type, + MPI_SUM, + module->m_comm); + if (OMPI_SUCCESS != ret) goto cleanup; + module->m_setup_info->num_btls_expected = local; + /* end fill in information about remote peers */ + + /* send our contact info to everyone... */ + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + ompi_osc_rdma_peer_info_t *peer_info = &module->m_peer_info[i]; + int j; + + for (j = 0 ; j < peer_info->local_num_btls ; ++j) { + peer_rdma_send_info_t *peer_send_info = + OBJ_NEW(peer_rdma_send_info_t); + peer_send_info->module = module; + peer_send_info->proc = ompi_comm_peer_lookup(module->m_comm, i); + peer_send_info->bml_btl = peer_info->local_btls[j]; + peer_send_info->seg_key = + peer_info->local_descriptors[j]->des_dst[0].seg_key.key64; + + ret = rdma_send_info_send(module, peer_send_info); + if (OMPI_SUCCESS != ret) { + opal_list_append(&(module->m_setup_info->outstanding_btl_requests[i]), + &peer_send_info->super); + } + } + } + + OPAL_THREAD_LOCK(&module->m_lock); + while ((module->m_setup_info->num_btls_outgoing != 0) || + (module->m_setup_info->num_btls_expected != + module->m_setup_info->num_btls_callin)) { + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + peer_rdma_send_info_t *peer_send_info = + (peer_rdma_send_info_t*) opal_list_remove_first(&module->m_setup_info->outstanding_btl_requests[i]); + if (NULL != peer_send_info) { + ret = rdma_send_info_send(module, peer_send_info); + if (OMPI_SUCCESS != ret) { + opal_list_append(&(module->m_setup_info->outstanding_btl_requests[i]), + &peer_send_info->super); + } + } + } + opal_condition_wait(&module->m_cond, &module->m_lock); + } + OPAL_THREAD_UNLOCK(&module->m_lock); + + ret = OMPI_SUCCESS; + + cleanup: + if (NULL != module->m_setup_info) { + if (NULL != module->m_setup_info->outstanding_btl_requests) { + for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { + OBJ_DESTRUCT(&(module->m_setup_info->outstanding_btl_requests[i])); + } + free(module->m_setup_info->outstanding_btl_requests); + } + free(module->m_setup_info); + } + if (NULL != remote) free(remote); + + return ret; +} diff --git a/ompi/mca/osc/rdma/osc_rdma_header.h b/ompi/mca/osc/rdma/osc_rdma_header.h index b86042db66..d0b7c12bc9 100644 --- a/ompi/mca/osc/rdma/osc_rdma_header.h +++ b/ompi/mca/osc/rdma/osc_rdma_header.h @@ -25,17 +25,18 @@ #include "opal/types.h" -#define OMPI_OSC_RDMA_HDR_PUT 0x0001 -#define OMPI_OSC_RDMA_HDR_ACC 0x0002 -#define OMPI_OSC_RDMA_HDR_GET 0x0003 -#define OMPI_OSC_RDMA_HDR_REPLY 0x0004 -#define OMPI_OSC_RDMA_HDR_POST 0x0005 -#define OMPI_OSC_RDMA_HDR_COMPLETE 0x0006 -#define OMPI_OSC_RDMA_HDR_LOCK_REQ 0x0007 -#define OMPI_OSC_RDMA_HDR_UNLOCK_REQ 0x0008 -#define OMPI_OSC_RDMA_HDR_UNLOCK_REPLY 0x0009 +#define OMPI_OSC_RDMA_HDR_PUT 0x01 +#define OMPI_OSC_RDMA_HDR_ACC 0x02 +#define OMPI_OSC_RDMA_HDR_GET 0x03 +#define OMPI_OSC_RDMA_HDR_REPLY 0x04 +#define OMPI_OSC_RDMA_HDR_POST 0x05 +#define OMPI_OSC_RDMA_HDR_COMPLETE 0x06 +#define OMPI_OSC_RDMA_HDR_LOCK_REQ 0x07 +#define OMPI_OSC_RDMA_HDR_UNLOCK_REQ 0x08 +#define OMPI_OSC_RDMA_HDR_UNLOCK_REPLY 0x09 +#define OMPI_OSC_RDMA_HDR_RDMA_INFO 0x0A -#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x0001 +#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x01 struct ompi_osc_rdma_base_header_t { uint8_t hdr_type; @@ -122,18 +123,44 @@ typedef struct ompi_osc_rdma_control_header_t ompi_osc_rdma_control_header_t; 
#define OMPI_OSC_RDMA_CONTROL_HDR_HTON(hdr) \ do { \ - OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base) \ - (hdr).hdr_windx = htons((hdr).hdr_windx); \ + OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base); \ + (hdr).hdr_windx = htons((hdr).hdr_windx); \ (hdr).hdr_value[0] = htonl((hdr).hdr_value[0]); \ (hdr).hdr_value[1] = htonl((hdr).hdr_value[1]); \ } while (0) #define OMPI_OSC_RDMA_CONTROL_HDR_NTOH(hdr) \ do { \ - OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base) \ - (hdr).hdr_windx = ntohs((hdr).hdr_windx); \ + OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base); \ + (hdr).hdr_windx = ntohs((hdr).hdr_windx); \ (hdr).hdr_value[0] = ntohl((hdr).hdr_value[0]); \ (hdr).hdr_value[1] = ntohl((hdr).hdr_value[1]); \ } while (0) + +struct ompi_osc_rdma_rdma_info_header_t { + ompi_osc_rdma_base_header_t hdr_base; + int16_t hdr_windx; + int32_t hdr_origin; + uint64_t hdr_segkey; +}; +typedef struct ompi_osc_rdma_rdma_info_header_t ompi_osc_rdma_rdma_info_header_t; + +#define OMPI_OSC_RDMA_RDMA_INFO_HDR_HTON(hdr) \ + do { \ + OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base); \ + (hdr).hdr_windx = htons((hdr).hdr_windx); \ + (hdr).hdr_origin = htonl((hdr).hdr_origin); \ + (hdr).hdr_segkey = hton64((hdr).hdr_segkey); \ + } while (0) + +#define OMPI_OSC_RDMA_RDMA_INFO_HDR_NTOH(hdr) \ + do { \ + OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base); \ + (hdr).hdr_windx = ntohs((hdr).hdr_windx); \ + (hdr).hdr_origin = ntohl((hdr).hdr_origin); \ + (hdr).hdr_segkey = ntoh64((hdr).hdr_segkey); \ + } while (0) + + #endif /* OMPI_MCA_OSC_RDMA_HDR_H */
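
For reference, the heart of the setup handshake added above is a counting
scheme: each rank records, per peer, how many RDMA-capable BTLs it will
advertise, and a reduce_scatter with one result element per rank tells each
rank how many rdma-info fragments it should expect (num_btls_expected). The
standalone MPI sketch below illustrates that same trick in isolation; it is
not OMPI-internal code, the one-BTL-per-peer counts and names like `outgoing`
are invented for illustration, and it uses MPI_UNSIGNED_LONG_LONG where the
patch selects MPI_LONG or MPI_LONG_LONG based on SIZEOF_LONG. The all-ones
recvcounts array plays the role the patch gives m_fence_coll_counts.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, size, i;
    uint64_t *outgoing;
    uint64_t expected = 0;
    int *counts;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    outgoing = malloc(sizeof(uint64_t) * size);
    counts = malloc(sizeof(int) * size);

    for (i = 0 ; i < size ; ++i) {
        /* stand-in for peer_info[i].local_num_btls: pretend we found
           exactly one RDMA-capable BTL to every peer but ourselves */
        outgoing[i] = (i == rank) ? 0 : 1;
        /* one result element per rank, like m_fence_coll_counts */
        counts[i] = 1;
    }

    /* element-wise sum across all ranks, then scatter: rank r receives
       the sum of every sender's outgoing[r], i.e. the number of
       rdma-info fragments headed its way -- num_btls_expected */
    MPI_Reduce_scatter(outgoing, &expected, counts,
                       MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

    printf("rank %d expects %llu rdma info fragments\n",
           rank, (unsigned long long) expected);

    free(outgoing);
    free(counts);
    MPI_Finalize();
    return 0;
}

Run with, e.g., mpirun -np 4: each rank reports 3 expected fragments, which
is what num_btls_expected would be when every peer pair shares exactly one
RDMA-capable BTL. setup_rdma() then spins on its condition variable until
num_btls_callin reaches this value and num_btls_outgoing drains to zero.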