From 0c25f7be092348ad8691ac93b774341810dc4252 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 27 Nov 2006 21:41:29 +0000 Subject: [PATCH] More One-sided fixes: * Fix a counter roll-over issue that could result from a large (but not excessive) number of outstanding put/get/accumulate calls during a single synchronization epoch (Refs trac:506) * Fix epoch issue with rdma component that would affect PWSC synchronization (Refs trac:507) This commit was SVN r12673. The following Trac tickets were found above: Ticket 506 --> https://svn.open-mpi.org/trac/ompi/ticket/506 Ticket 507 --> https://svn.open-mpi.org/trac/ompi/ticket/507 --- ompi/mca/osc/pt2pt/osc_pt2pt.h | 10 +++---- ompi/mca/osc/pt2pt/osc_pt2pt_component.c | 10 +++---- ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c | 2 +- ompi/mca/osc/pt2pt/osc_pt2pt_sync.c | 22 +++++++++------- ompi/mca/osc/rdma/osc_rdma.h | 10 +++---- ompi/mca/osc/rdma/osc_rdma_component.c | 10 +++---- ompi/mca/osc/rdma/osc_rdma_data_move.c | 23 ++++++++++++++++- ompi/mca/osc/rdma/osc_rdma_sync.c | 33 ++++++++++++++---------- 8 files changed, 76 insertions(+), 44 deletions(-) diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt.h b/ompi/mca/osc/pt2pt/osc_pt2pt.h index 2f85553265..2475959fe0 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt.h +++ b/ompi/mca/osc/pt2pt/osc_pt2pt.h @@ -87,10 +87,10 @@ struct ompi_osc_pt2pt_module_t { started. p2p_lock must be held when modifying this field. */ opal_list_t p2p_pending_sendreqs; - /** list of int16_t counters for the number of requests to a + /** list of unsigned int counters for the number of requests to a particular rank in p2p_comm for this access epoc. p2p_lock must be held when modifying this field */ - short *p2p_num_pending_sendreqs; + unsigned int *p2p_num_pending_sendreqs; /** For MPI_Fence synchronization, the number of messages to send in epoch. 
For Start/Complete, the number of updates for this @@ -124,15 +124,15 @@ struct ompi_osc_pt2pt_module_t { opal_list_t p2p_long_msgs; opal_list_t p2p_copy_pending_sendreqs; - short *p2p_copy_num_pending_sendreqs; + unsigned int *p2p_copy_num_pending_sendreqs; /* ********************* FENCE data ************************ */ /* an array of ints, each containing the value 1. */ int *p2p_fence_coll_counts; - /* an array of shorts, for use in experimenting + /* an array of unsigned ints, for use in experimenting with different synchronization costs */ - short *p2p_fence_coll_results; + unsigned int *p2p_fence_coll_results; /* ********************* PWSC data ************************ */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_component.c b/ompi/mca/osc/pt2pt/osc_pt2pt_component.c index 15f7924fb8..96a22f9cfd 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_component.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_component.c @@ -257,7 +257,7 @@ ompi_osc_pt2pt_component_select(ompi_win_t *win, OBJ_CONSTRUCT(&module->p2p_pending_control_sends, opal_list_t); OBJ_CONSTRUCT(&module->p2p_pending_sendreqs, opal_list_t); - module->p2p_num_pending_sendreqs = (short*)malloc(sizeof(short) * + module->p2p_num_pending_sendreqs = (unsigned int*)malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); if (NULL == module->p2p_num_pending_sendreqs) { OBJ_DESTRUCT(&module->p2p_pending_sendreqs); @@ -268,7 +268,7 @@ ompi_osc_pt2pt_component_select(ompi_win_t *win, return ret; } memset(module->p2p_num_pending_sendreqs, 0, - sizeof(short) * ompi_comm_size(module->p2p_comm)); + sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); module->p2p_num_pending_out = 0; module->p2p_num_pending_in = 0; @@ -279,7 +279,7 @@ ompi_osc_pt2pt_component_select(ompi_win_t *win, OBJ_CONSTRUCT(&(module->p2p_long_msgs), opal_list_t); OBJ_CONSTRUCT(&(module->p2p_copy_pending_sendreqs), opal_list_t); - module->p2p_copy_num_pending_sendreqs = (short*)malloc(sizeof(short) * + module->p2p_copy_num_pending_sendreqs = 
(unsigned int*)malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); if (NULL == module->p2p_copy_num_pending_sendreqs) { OBJ_DESTRUCT(&module->p2p_copy_pending_sendreqs); @@ -293,7 +293,7 @@ ompi_osc_pt2pt_component_select(ompi_win_t *win, return ret; } memset(module->p2p_num_pending_sendreqs, 0, - sizeof(short) * ompi_comm_size(module->p2p_comm)); + sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); /* fence data */ module->p2p_fence_coll_counts = (int*)malloc(sizeof(int) * @@ -314,7 +314,7 @@ ompi_osc_pt2pt_component_select(ompi_win_t *win, module->p2p_fence_coll_counts[i] = 1; } - module->p2p_fence_coll_results = (short*)malloc(sizeof(short) * + module->p2p_fence_coll_results = (unsigned int*)malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); if (NULL == module->p2p_fence_coll_results) { free(module->p2p_fence_coll_counts); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c index 81b67eb9f1..9abee84f96 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c @@ -264,7 +264,7 @@ ompi_osc_pt2pt_sendreq_send(ompi_osc_pt2pt_module_t *module, #endif /* send fragment */ - opal_output_verbose(50, ompi_osc_base_output, + opal_output_verbose(51, ompi_osc_base_output, "%d sending sendreq to %d", sendreq->req_module->p2p_comm->c_my_rank, sendreq->req_target_rank); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c index 3f5f65bda6..0baf8e27bf 100644 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c +++ b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c @@ -63,7 +63,7 @@ ompi_osc_pt2pt_progress_long(ompi_osc_pt2pt_module_t *module) static inline void ompi_osc_pt2pt_flip_sendreqs(ompi_osc_pt2pt_module_t *module) { - short *tmp; + unsigned int *tmp; OPAL_THREAD_LOCK(&(module->p2p_lock)); @@ -72,7 +72,7 @@ ompi_osc_pt2pt_flip_sendreqs(ompi_osc_pt2pt_module_t *module) module->p2p_num_pending_sendreqs; module->p2p_num_pending_sendreqs = tmp; 
memset(module->p2p_num_pending_sendreqs, 0, - sizeof(short) * ompi_comm_size(module->p2p_comm)); + sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); /* Copy in all the pending requests */ opal_list_join(&module->p2p_copy_pending_sendreqs, @@ -86,7 +86,7 @@ ompi_osc_pt2pt_flip_sendreqs(ompi_osc_pt2pt_module_t *module) int ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win) { - short incoming_reqs; + unsigned int incoming_reqs; int ret = OMPI_SUCCESS, i; if (0 != (assert & MPI_MODE_NOPRECEDE)) { @@ -116,7 +116,7 @@ ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win) c_coll.coll_reduce_scatter(P2P_MODULE(win)->p2p_copy_num_pending_sendreqs, &incoming_reqs, P2P_MODULE(win)->p2p_fence_coll_counts, - MPI_SHORT, + MPI_UNSIGNED, MPI_SUM, P2P_MODULE(win)->p2p_comm); @@ -200,6 +200,9 @@ ompi_osc_pt2pt_module_start(ompi_group_t *group, P2P_MODULE(win)->p2p_sc_group = group; OPAL_THREAD_UNLOCK(&(P2P_MODULE(win)->p2p_lock)); + memset(P2P_MODULE(win)->p2p_sc_remote_active_ranks, 0, + sizeof(bool) * ompi_comm_size(P2P_MODULE(win)->p2p_comm)); + /* for each process in the specified group, find it's rank in our communicator, store those indexes, and set the true / false in the active ranks table */ @@ -263,11 +266,12 @@ ompi_osc_pt2pt_module_complete(ompi_win_t *win) OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_pending_out), P2P_MODULE(win)->p2p_copy_num_pending_sendreqs[comm_rank]); - ompi_osc_pt2pt_control_send(P2P_MODULE(win), - P2P_MODULE(win)->p2p_sc_group->grp_proc_pointers[i], - OMPI_OSC_PT2PT_HDR_COMPLETE, - P2P_MODULE(win)->p2p_copy_num_pending_sendreqs[comm_rank], - 0); + ret = ompi_osc_pt2pt_control_send(P2P_MODULE(win), + P2P_MODULE(win)->p2p_sc_group->grp_proc_pointers[i], + OMPI_OSC_PT2PT_HDR_COMPLETE, + P2P_MODULE(win)->p2p_copy_num_pending_sendreqs[comm_rank], + 0); + assert(ret == OMPI_SUCCESS); } /* try to start all the requests. 
We've copied everything we diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 924aa38639..9c4ce8a872 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -72,10 +72,10 @@ struct ompi_osc_rdma_module_t { started. p2p_lock must be held when modifying this field. */ opal_list_t p2p_pending_sendreqs; - /** list of int16_t counters for the number of requests to a + /** list of unsigned int counters for the number of requests to a particular rank in p2p_comm for this access epoc. p2p_lock must be held when modifying this field */ - short *p2p_num_pending_sendreqs; + unsigned int *p2p_num_pending_sendreqs; /** For MPI_Fence synchronization, the number of messages to send in epoch. For Start/Complete, the number of updates for this @@ -109,7 +109,7 @@ struct ompi_osc_rdma_module_t { opal_list_t p2p_long_msgs; opal_list_t p2p_copy_pending_sendreqs; - short *p2p_copy_num_pending_sendreqs; + unsigned int *p2p_copy_num_pending_sendreqs; bool p2p_eager_send; @@ -117,9 +117,9 @@ struct ompi_osc_rdma_module_t { /* an array of ints, each containing the value 1. 
*/ int *p2p_fence_coll_counts; - /* an array of shorts, for use in experimenting + /* an array of unsigned ints, for use in experimenting with different synchronization costs */ - short *p2p_fence_coll_results; + unsigned int *p2p_fence_coll_results; mca_osc_fence_sync_t p2p_fence_sync_type; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index db67556c80..385dbc0753 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -264,7 +264,7 @@ ompi_osc_rdma_component_select(ompi_win_t *win, } OBJ_CONSTRUCT(&module->p2p_pending_sendreqs, opal_list_t); - module->p2p_num_pending_sendreqs = (short*)malloc(sizeof(short) * + module->p2p_num_pending_sendreqs = (unsigned int*)malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); if (NULL == module->p2p_num_pending_sendreqs) { OBJ_DESTRUCT(&module->p2p_pending_sendreqs); @@ -275,7 +275,7 @@ ompi_osc_rdma_component_select(ompi_win_t *win, return ret; } memset(module->p2p_num_pending_sendreqs, 0, - sizeof(short) * ompi_comm_size(module->p2p_comm)); + sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); module->p2p_num_pending_out = 0; module->p2p_num_pending_in = 0; @@ -286,7 +286,7 @@ ompi_osc_rdma_component_select(ompi_win_t *win, OBJ_CONSTRUCT(&(module->p2p_long_msgs), opal_list_t); OBJ_CONSTRUCT(&(module->p2p_copy_pending_sendreqs), opal_list_t); - module->p2p_copy_num_pending_sendreqs = (short*)malloc(sizeof(short) * + module->p2p_copy_num_pending_sendreqs = (unsigned int*)malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); if (NULL == module->p2p_copy_num_pending_sendreqs) { OBJ_DESTRUCT(&module->p2p_copy_pending_sendreqs); @@ -300,7 +300,7 @@ ompi_osc_rdma_component_select(ompi_win_t *win, return ret; } memset(module->p2p_num_pending_sendreqs, 0, - sizeof(short) * ompi_comm_size(module->p2p_comm)); + sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); module->p2p_eager_send = 
check_config_value_bool("eager_send", info); @@ -323,7 +323,7 @@ ompi_osc_rdma_component_select(ompi_win_t *win, module->p2p_fence_coll_counts[i] = 1; } - module->p2p_fence_coll_results = (short*)malloc(sizeof(short) * + module->p2p_fence_coll_results = (unsigned int*)malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); if (NULL == module->p2p_fence_coll_results) { free(module->p2p_fence_coll_counts); diff --git a/ompi/mca/osc/rdma/osc_rdma_data_move.c b/ompi/mca/osc/rdma/osc_rdma_data_move.c index 50ca0577b5..f70e5fd45d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_data_move.c +++ b/ompi/mca/osc/rdma/osc_rdma_data_move.c @@ -92,6 +92,8 @@ ompi_osc_rdma_sendreq_send_cb(struct mca_btl_base_module_t* btl, (ompi_osc_rdma_sendreq_t*) descriptor->des_cbdata; ompi_osc_rdma_send_header_t *header = (ompi_osc_rdma_send_header_t*) descriptor->des_src[0].seg_addr.pval; + opal_list_item_t *item; + ompi_osc_rdma_module_t *module = sendreq->req_module; if (OMPI_SUCCESS != status) { /* requeue and return */ @@ -151,7 +153,26 @@ ompi_osc_rdma_sendreq_send_cb(struct mca_btl_base_module_t* btl, btl->btl_free(btl, descriptor); /* any other sendreqs to restart? */ - /* BWB - FIX ME - implement sending the next sendreq here */ + while (NULL != + (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { + ompi_osc_rdma_sendreq_t *req = + (ompi_osc_rdma_sendreq_t*) item; + int ret; + + ret = ompi_osc_rdma_sendreq_send(module, req); + + if (OMPI_SUCCESS != ret) { + opal_output_verbose(5, ompi_osc_base_output, + "fence: failure in starting sendreq (%d). 
Will try later.", + ret); + opal_list_append(&(module->p2p_copy_pending_sendreqs), item); + + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret || + OMPI_ERR_OUT_OF_RESOURCE == ret) { + break; + } + } + } } diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.c b/ompi/mca/osc/rdma/osc_rdma_sync.c index 3b59126aa7..b090f9203f 100644 --- a/ompi/mca/osc/rdma/osc_rdma_sync.c +++ b/ompi/mca/osc/rdma/osc_rdma_sync.c @@ -64,7 +64,7 @@ ompi_osc_rdma_progress(ompi_osc_rdma_module_t *module) static inline void ompi_osc_rdma_flip_sendreqs(ompi_osc_rdma_module_t *module) { - short *tmp; + unsigned int *tmp; OPAL_THREAD_LOCK(&(module->p2p_lock)); @@ -73,7 +73,7 @@ ompi_osc_rdma_flip_sendreqs(ompi_osc_rdma_module_t *module) module->p2p_num_pending_sendreqs; module->p2p_num_pending_sendreqs = tmp; memset(module->p2p_num_pending_sendreqs, 0, - sizeof(short) * ompi_comm_size(module->p2p_comm)); + sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); /* Copy in all the pending requests */ opal_list_join(&module->p2p_copy_pending_sendreqs, @@ -87,7 +87,7 @@ ompi_osc_rdma_flip_sendreqs(ompi_osc_rdma_module_t *module) int ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) { - short incoming_reqs; + unsigned int incoming_reqs; int ret = OMPI_SUCCESS, i; if (0 != (assert & MPI_MODE_NOPRECEDE)) { @@ -120,7 +120,7 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) c_coll.coll_reduce_scatter(P2P_MODULE(win)->p2p_copy_num_pending_sendreqs, &incoming_reqs, P2P_MODULE(win)->p2p_fence_coll_counts, - MPI_SHORT, + MPI_UNSIGNED, MPI_SUM, P2P_MODULE(win)->p2p_comm); break; @@ -130,7 +130,7 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) c_coll.coll_allreduce(P2P_MODULE(win)->p2p_copy_num_pending_sendreqs, P2P_MODULE(win)->p2p_fence_coll_results, ompi_comm_size(P2P_MODULE(win)->p2p_comm), - MPI_SHORT, + MPI_UNSIGNED, MPI_SUM, P2P_MODULE(win)->p2p_comm); incoming_reqs = P2P_MODULE(win)-> @@ -141,10 +141,10 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) ret = 
P2P_MODULE(win)->p2p_comm-> c_coll.coll_alltoall(P2P_MODULE(win)->p2p_copy_num_pending_sendreqs, 1, - MPI_SHORT, + MPI_UNSIGNED, P2P_MODULE(win)->p2p_fence_coll_results, 1, - MPI_SHORT, + MPI_UNSIGNED, P2P_MODULE(win)->p2p_comm); incoming_reqs = 0; for (i = 0 ; i < ompi_comm_size(P2P_MODULE(win)->p2p_comm) ; ++i) { @@ -198,6 +198,11 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) "fence: failure in starting sendreq (%d). Will try later.", ret); opal_list_append(&(P2P_MODULE(win)->p2p_copy_pending_sendreqs), item); + + if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret || + OMPI_ERR_OUT_OF_RESOURCE == ret) { + break; + } } } @@ -301,11 +306,12 @@ ompi_osc_rdma_module_complete(ompi_win_t *win) OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_pending_out), P2P_MODULE(win)->p2p_copy_num_pending_sendreqs[comm_rank]); - ompi_osc_rdma_control_send(P2P_MODULE(win), - P2P_MODULE(win)->p2p_sc_group->grp_proc_pointers[i], - OMPI_OSC_RDMA_HDR_COMPLETE, - P2P_MODULE(win)->p2p_copy_num_pending_sendreqs[comm_rank], - 0); + ret = ompi_osc_rdma_control_send(P2P_MODULE(win), + P2P_MODULE(win)->p2p_sc_group->grp_proc_pointers[i], + OMPI_OSC_RDMA_HDR_COMPLETE, + P2P_MODULE(win)->p2p_copy_num_pending_sendreqs[comm_rank], + 0); + assert(ret == OMPI_SUCCESS); } /* try to start all the requests. We've copied everything we @@ -327,6 +333,7 @@ ompi_osc_rdma_module_complete(ompi_win_t *win) } /* wait for all the requests */ + ompi_osc_rdma_progress(P2P_MODULE(win)); while (0 != P2P_MODULE(win)->p2p_num_pending_out) { ompi_osc_rdma_progress(P2P_MODULE(win)); } @@ -364,7 +371,7 @@ ompi_osc_rdma_module_post(ompi_group_t *group, /* Set our mode to expose w/ post */ ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_set_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); + ompi_win_append_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); /* list how many complete counters we're still waiting on */ OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_complete_msgs),