diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h
index 3d9cade059..a9b6ba0242 100644
--- a/ompi/mca/osc/rdma/osc_rdma.h
+++ b/ompi/mca/osc/rdma/osc_rdma.h
@@ -133,7 +133,8 @@ struct ompi_osc_rdma_module_t {
     opal_list_t m_queued_sendreqs;
 
     /** start sending data eagerly */
-    bool m_eager_send;
+    bool m_eager_send_active;
+    bool m_eager_send_ok;
 
     /* ********************* FENCE data ************************ */
     /* an array of ints, each containing the value
diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c
index b68f660bde..0778ecea3a 100644
--- a/ompi/mca/osc/rdma/osc_rdma_comm.c
+++ b/ompi/mca/osc/rdma/osc_rdma_comm.c
@@ -50,9 +50,10 @@ ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count,
 {
     int ret;
     ompi_osc_rdma_sendreq_t *sendreq;
+    ompi_osc_rdma_module_t *module = GET_MODULE(win);
 
     if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) &&
-        (!GET_MODULE(win)->m_sc_remote_active_ranks[target])) {
+        (!module->m_sc_remote_active_ranks[target])) {
         return MPI_ERR_RMA_SYNC;
     }
 
@@ -84,14 +85,32 @@ ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count,
                                            target_disp,
                                            target_count,
                                            target_dt,
-                                           GET_MODULE(win),
+                                           module,
                                            &sendreq);
     if (OMPI_SUCCESS != ret) return ret;
 
     sendreq->req_op_id = op->o_f_to_c_index;
 
-    /* enqueue sendreq */
-    ret = enqueue_sendreq(GET_MODULE(win), sendreq);
+    if (0 && module->m_eager_send_active) {
+        OPAL_THREAD_LOCK(&module->m_lock);
+        sendreq->req_module->m_num_pending_out += 1;
+        module->m_num_pending_sendreqs[sendreq->req_target_rank] += 1;
+        OPAL_THREAD_UNLOCK(&(module->m_lock));
+
+        ret = ompi_osc_rdma_sendreq_send(module, sendreq);
+
+        if (OMPI_SUCCESS != ret) {
+            OPAL_THREAD_LOCK(&module->m_lock);
+            sendreq->req_module->m_num_pending_out -= 1;
+            opal_list_append(&(module->m_pending_sendreqs),
+                             (opal_list_item_t*) sendreq);
+            OPAL_THREAD_UNLOCK(&module->m_lock);
+            ret = OMPI_SUCCESS;
+        }
+    } else {
+        /* enqueue sendreq */
+        ret = enqueue_sendreq(module, sendreq);
+    }
 
     return ret;
 }
@@ -140,10 +159,7 @@ ompi_osc_rdma_module_get(void *origin_addr,
                                            &sendreq);
     if (OMPI_SUCCESS != ret) return ret;
 
-    /* if we're doing fence synchronization, try to actively send
-       right now */
-    if (module->m_eager_send &&
-        (OMPI_WIN_FENCE & ompi_win_get_mode(win))) {
+    if (module->m_eager_send_active) {
         OPAL_THREAD_LOCK(&module->m_lock);
         sendreq->req_module->m_num_pending_out += 1;
         module->m_num_pending_sendreqs[sendreq->req_target_rank] += 1;
@@ -156,7 +172,7 @@ ompi_osc_rdma_module_get(void *origin_addr,
             sendreq->req_module->m_num_pending_out -= 1;
             opal_list_append(&(module->m_pending_sendreqs),
                              (opal_list_item_t*) sendreq);
-            OPAL_THREAD_LOCK(&module->m_lock);
+            OPAL_THREAD_UNLOCK(&module->m_lock);
             ret = OMPI_SUCCESS;
         }
     } else {
@@ -209,8 +225,7 @@ ompi_osc_rdma_module_put(void *origin_addr, int origin_count,
 
     /* if we're doing fence synchronization, try to actively send
        right now */
-    if (module->m_eager_send &&
-        (OMPI_WIN_FENCE & ompi_win_get_mode(win))) {
+    if (module->m_eager_send_active) {
         OPAL_THREAD_LOCK(&module->m_lock);
         sendreq->req_module->m_num_pending_out += 1;
         module->m_num_pending_sendreqs[sendreq->req_target_rank] += 1;
@@ -223,7 +238,7 @@ ompi_osc_rdma_module_put(void *origin_addr, int origin_count,
             sendreq->req_module->m_num_pending_out -= 1;
             opal_list_append(&(module->m_pending_sendreqs),
                              (opal_list_item_t*) sendreq);
-            OPAL_THREAD_LOCK(&module->m_lock);
+            OPAL_THREAD_UNLOCK(&module->m_lock);
             ret = OMPI_SUCCESS;
         }
     } else {
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index 2172143387..56b1bd8cee 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -322,7 +322,11 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
     memset(module->m_num_pending_sendreqs, 0,
            sizeof(unsigned int) * ompi_comm_size(module->m_comm));
 
-    module->m_eager_send = check_config_value_bool("eager_send", info);
+    module->m_eager_send_ok = check_config_value_bool("eager_send", info);
+    /* initially, we're in that pseudo-fence state, so we allow eager
+       sends (yay for Fence).  Other protocols will disable before
+       they start their epochs, so this isn't a problem. */
+    module->m_eager_send_active = module->m_eager_send_ok;
 
     /* fence data */
     module->m_fence_coll_counts = (int*)
@@ -623,7 +627,39 @@ component_fragment_cb(struct mca_btl_base_module_t *btl,
             OPAL_THREAD_LOCK(&module->m_lock);
             count = (module->m_num_post_msgs -= 1);
             OPAL_THREAD_UNLOCK(&module->m_lock);
-            if (count == 0) opal_condition_broadcast(&module->m_cond);
+            if (count == 0) {
+                module->m_eager_send_active = module->m_eager_send_ok;
+
+                while (module->m_eager_send_active &&
+                       opal_list_get_size(&module->m_pending_sendreqs)) {
+                    ompi_osc_rdma_sendreq_t *sendreq;
+
+                    OPAL_THREAD_LOCK(&module->m_lock);
+                    sendreq = (ompi_osc_rdma_sendreq_t*)
+                        opal_list_remove_first(&module->m_pending_sendreqs);
+
+                    if (NULL == sendreq) {
+                        OPAL_THREAD_UNLOCK(&module->m_lock);
+                        break;
+                    }
+
+                    sendreq->req_module->m_num_pending_out += 1;
+                    OPAL_THREAD_UNLOCK(&module->m_lock);
+
+                    ret = ompi_osc_rdma_sendreq_send(module, sendreq);
+
+                    if (OMPI_SUCCESS != ret) {
+                        OPAL_THREAD_LOCK(&module->m_lock);
+                        sendreq->req_module->m_num_pending_out -= 1;
+                        opal_list_append(&(module->m_pending_sendreqs),
+                                         (opal_list_item_t*) sendreq);
+                        OPAL_THREAD_UNLOCK(&module->m_lock);
+                        break;
+                    }
+                }
+
+                opal_condition_broadcast(&module->m_cond);
+            }
         }
         break;
     case OMPI_OSC_RDMA_HDR_COMPLETE:
diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.c b/ompi/mca/osc/rdma/osc_rdma_sync.c
index 04a171b94a..c5e984a6b9 100644
--- a/ompi/mca/osc/rdma/osc_rdma_sync.c
+++ b/ompi/mca/osc/rdma/osc_rdma_sync.c
@@ -168,11 +168,15 @@ ompi_osc_rdma_module_start(ompi_group_t *group,
 {
     int i, ret = OMPI_SUCCESS;
     ompi_osc_rdma_module_t *module = GET_MODULE(win);
+    int32_t count;
 
     OBJ_RETAIN(group);
     ompi_group_increment_proc_count(group);
 
+    module->m_eager_send_active = false;
+
     OPAL_THREAD_LOCK(&module->m_lock);
+
     if (NULL != module->m_sc_group) {
         OPAL_THREAD_UNLOCK(&module->m_lock);
         ret = MPI_ERR_RMA_SYNC;
@@ -182,7 +186,7 @@ ompi_osc_rdma_module_start(ompi_group_t *group,
 
     /* possible we've already received a couple in messages, so
        add however many we're going to wait for */
-    module->m_num_post_msgs += ompi_group_size(module->m_sc_group);
+    count = (module->m_num_post_msgs += ompi_group_size(module->m_sc_group));
     OPAL_THREAD_UNLOCK(&(module->m_lock));
 
     memset(module->m_sc_remote_active_ranks, 0,
@@ -221,6 +225,10 @@ ompi_osc_rdma_module_start(ompi_group_t *group,
     ompi_win_remove_mode(win, OMPI_WIN_FENCE);
     ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_STARTED);
 
+    if (count == 0) {
+        module->m_eager_send_active = module->m_eager_send_ok;
+    }
+
     return OMPI_SUCCESS;
 
 clean:
@@ -249,12 +257,8 @@ ompi_osc_rdma_module_complete(ompi_win_t *win)
 
     /* for each process in group, send a control message with number
        of updates coming, then start all the requests */
-    for (i = 0 ; i < ompi_group_size(module->m_sc_group) ; ++i) {
-        int comm_rank = module->m_sc_remote_ranks[i];
-
-        module->m_num_pending_out +=
-            module->m_copy_num_pending_sendreqs[comm_rank];
-    }
+    module->m_num_pending_out +=
+        (int32_t) opal_list_get_size(&module->m_copy_pending_sendreqs);
     OPAL_THREAD_UNLOCK(&module->m_lock);
 
     for (i = 0 ; i < ompi_group_size(module->m_sc_group) ; ++i) {
@@ -390,13 +394,13 @@ ompi_osc_rdma_module_test(ompi_win_t *win,
 
     *flag = 1;
 
-    ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED);
-
     OPAL_THREAD_LOCK(&(module->m_lock));
     group = module->m_pw_group;
     module->m_pw_group = NULL;
     OPAL_THREAD_UNLOCK(&(module->m_lock));
 
+    ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED);
+
     ompi_group_decrement_proc_count(group);
     OBJ_RELEASE(group);
 
@@ -439,6 +443,8 @@ ompi_osc_rdma_module_lock(int lock_type,
 
                                module->m_comm->c_my_rank, lock_type);
 
+    module->m_eager_send_active = false;
+
     /* return */
     return OMPI_SUCCESS;
 }
@@ -517,6 +523,8 @@ ompi_osc_rdma_module_unlock(int target,
 
     /* set our mode on the window */
     ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS);
 
+    module->m_eager_send_active = module->m_eager_send_ok;
+
     return OMPI_SUCCESS;
 }
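
For readers following the change: the patch splits the old m_eager_send flag into m_eager_send_ok (eager sends are allowed for this window at all, from the "eager_send" info key) and m_eager_send_active (the current synchronization epoch permits sending right now), and it also fixes a fallback path that re-took m_lock where it should have released it. Note that the accumulate path keeps its eager branch compiled out (the "0 &&" guard), so only put and get take the eager path in this change. What follows is a hedged, self-contained sketch of the two-flag pattern and of the drain loop run when the last POST message arrives; epoch_t, request_t, try_send(), start_request(), and drain_pending() are hypothetical stand-ins, not the osc/rdma API (in the real module the flags live in ompi_osc_rdma_module_t, the queue is an opal_list_t, and the send is ompi_osc_rdma_sendreq_send()).

/* Hedged sketch only -- simplified illustration, not the Open MPI code. */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct request { struct request *next; } request_t;

typedef struct epoch {
    bool eager_send_ok;       /* eager sends allowed for this window (info key) */
    bool eager_send_active;   /* current epoch permits sending right now        */
    pthread_mutex_t lock;
    int  num_pending_out;     /* operations started but not yet completed       */
    request_t *head, *tail;   /* requests deferred until the epoch opens        */
} epoch_t;

/* Stand-in for the transport send; returns 0 on success, -1 when the
 * request cannot be started right now (e.g. out of send credits). */
static int try_send(epoch_t *ep, request_t *req) { (void) ep; (void) req; return 0; }

static void enqueue(epoch_t *ep, request_t *req)
{
    req->next = NULL;
    if (ep->tail) ep->tail->next = req; else ep->head = req;
    ep->tail = req;
}

static request_t *dequeue(epoch_t *ep)
{
    request_t *req = ep->head;
    if (req) { ep->head = req->next; if (NULL == ep->head) ep->tail = NULL; }
    return req;
}

/* Mirrors the put/get path: send eagerly when the epoch is open,
 * otherwise (or after a failed send) park the request for later. */
void start_request(epoch_t *ep, request_t *req)
{
    if (ep->eager_send_active) {
        pthread_mutex_lock(&ep->lock);
        ep->num_pending_out += 1;              /* count it before sending       */
        pthread_mutex_unlock(&ep->lock);

        if (0 == try_send(ep, req)) return;

        pthread_mutex_lock(&ep->lock);
        ep->num_pending_out -= 1;              /* undo: nothing actually left   */
        enqueue(ep, req);                      /* retry later, not an error     */
        pthread_mutex_unlock(&ep->lock);       /* must UNLOCK here; re-locking
                                                  was the bug the patch fixes   */
    } else {
        pthread_mutex_lock(&ep->lock);
        enqueue(ep, req);                      /* epoch not open yet            */
        pthread_mutex_unlock(&ep->lock);
    }
}

/* Mirrors the POST-message callback: once the last post arrives, re-enable
 * eager sends and push out everything queued while the epoch was closed. */
void drain_pending(epoch_t *ep)
{
    ep->eager_send_active = ep->eager_send_ok;

    while (ep->eager_send_active) {
        pthread_mutex_lock(&ep->lock);
        request_t *req = dequeue(ep);
        if (NULL == req) { pthread_mutex_unlock(&ep->lock); break; }
        ep->num_pending_out += 1;
        pthread_mutex_unlock(&ep->lock);       /* drop the lock to send         */

        if (0 != try_send(ep, req)) {
            pthread_mutex_lock(&ep->lock);
            ep->num_pending_out -= 1;
            enqueue(ep, req);                  /* put it back; try again later  */
            pthread_mutex_unlock(&ep->lock);
            break;                             /* out of resources; stop draining */
        }
    }
}

The design point the sketch tries to capture is that the lock is always dropped before calling into the send path, and a send failure is treated as "try again later" (the request goes back on the pending list) rather than as an error returned to the caller.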