
  * Fix race condition in post/wait/start/complete synchronization where one
    epoch's control data could overwrite the previous epoch's data because
    we were reusing data structures between PW and SC.  Instead, we now
    have explicit post_msg and complete_msg counters for completion (see
    the counter sketch after the ticket references below).

    refs trac:354

  * Only register the rdma osc callback once, as it turns out that some
    btls (MX) do something more than update a table during the register
    call, and each register call sucks up valuable fragments... (see the
    sketch just below)
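
    Illustration for the register-once change: a self-contained sketch of
    the guard, using a C11 atomic where the real code uses
    OPAL_THREAD_ADD32 on an int32_t.  The registered_callback name matches
    the counter added in the rdma component diff below; component_select()
    and register_fragment_cb() are hypothetical stand-ins for the real
    selection routine and the mca_bml.bml_register() call.

      #include <stdatomic.h>
      #include <stdio.h>

      static atomic_int registered_callback = 0;

      static void register_fragment_cb(void)
      {
          /* stand-in for the one-time btl/bml callback registration */
          printf("fragment callback registered\n");
      }

      static void component_select(void)
      {
          /* atomic_fetch_add returns the previous value, so only the
           * caller that moves the counter from 0 to 1 registers the
           * callback; later window creations skip it and keep their
           * fragments. */
          if (atomic_fetch_add(&registered_callback, 1) == 0)
              register_fragment_cb();
      }

      int main(void)
      {
          component_select();   /* first window: registers the callback */
          component_select();   /* second window: guard skips the call  */
          return 0;
      }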

This commit was SVN r11745.

The following Trac tickets were found above:
  Ticket 354 --> https://svn.open-mpi.org/trac/ompi/ticket/354
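
Illustration for the race-condition fix: a self-contained sketch of the
split counters, using C11 atomics where the real code uses
OPAL_THREAD_ADD32 on volatile int32_t fields.  Only the idea behind
p2p_num_post_msgs / p2p_num_complete_msgs is taken from the diffs below;
the struct, helper names, and the toy main() are hypothetical.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Toy module: each control-message type gets its own counter, so a
     * Start/Complete handshake can no longer consume or corrupt the
     * count a Post/Wait handshake is waiting on (and vice versa). */
    struct osc_module_sketch {
        atomic_int num_post_msgs;      /* posts still expected (access side)       */
        atomic_int num_complete_msgs;  /* completes still expected (exposure side) */
    };

    static void noop_progress(void) { /* stands in for the progress engine */ }

    /* MPI_Win_start: expect one post message per peer in the access group. */
    static void start_epoch(struct osc_module_sketch *m, int group_size)
    {
        atomic_fetch_add(&m->num_post_msgs, group_size);
    }

    /* MPI_Win_complete: spin only on the post counter. */
    static void complete_epoch(struct osc_module_sketch *m, void (*progress)(void))
    {
        while (atomic_load(&m->num_post_msgs) != 0)
            progress();
    }

    /* MPI_Win_post: expect one complete message per peer in the exposure group. */
    static void post_epoch(struct osc_module_sketch *m, int group_size)
    {
        atomic_fetch_add(&m->num_complete_msgs, group_size);
    }

    /* MPI_Win_wait: spin only on the complete counter. */
    static void wait_epoch(struct osc_module_sketch *m, void (*progress)(void))
    {
        while (atomic_load(&m->num_complete_msgs) != 0)
            progress();
    }

    /* Receive side: each header type decrements its own counter. */
    static void on_post_msg(struct osc_module_sketch *m)
    {
        atomic_fetch_sub(&m->num_post_msgs, 1);
    }

    static void on_complete_msg(struct osc_module_sketch *m)
    {
        atomic_fetch_sub(&m->num_complete_msgs, 1);
    }

    int main(void)
    {
        struct osc_module_sketch m = { 0 };

        start_epoch(&m, 2);      /* access epoch over a two-process group */
        on_post_msg(&m);         /* post arrives from peer 0 */
        on_post_msg(&m);         /* post arrives from peer 1 */
        complete_epoch(&m, noop_progress);

        post_epoch(&m, 2);       /* exposure epoch over the same group */
        on_complete_msg(&m);
        on_complete_msg(&m);
        wait_epoch(&m, noop_progress);

        printf("post=%d complete=%d\n",
               atomic_load(&m.num_post_msgs),
               atomic_load(&m.num_complete_msgs));
        return 0;
    }

Because each wait loop watches its own counter, a post or complete header
that arrives early only affects the counter for its own handshake, which
is the overwrite the shared pending counters allowed.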
This commit is contained in:
Brian Barrett 2006-09-21 19:57:57 +00:00
parent 443af2a259
commit 2ec0c4f593
6 changed files with 45 additions and 26 deletions

View file

@ -95,20 +95,26 @@ struct ompi_osc_pt2pt_module_t {
/** For MPI_Fence synchronization, the number of messages to send
in epoch. For Start/Complete, the number of updates for this
Complete. For Post/Wait (poorly named), the number of
Complete counters we're waiting for. For lock, the number of
Complete. For lock, the number of
messages waiting for completion on on the origin side. Not
protected by p2p_lock - must use atomic counter operations. */
volatile int32_t p2p_num_pending_out;
/** For MPI_Fence synchronization, the number of expected incoming
messages. For Start/Complete, the number of expected Post
messages. For Post/Wait, the number of expected updates from
complete. For lock, the number of messages on the passive side
we are waiting for. Not protected by p2p_lock - must use
atomic counter operations. */
volatile int32_t p2p_num_pending_in;
/** Number of "ping" messages from the remote post group we've
received */
volatile int32_t p2p_num_post_msgs;
/** Number of "count" messages from the remote complete group
we've received */
volatile int32_t p2p_num_complete_msgs;
/** cyclic counter for a unique tag for long messages. Not
protected by the p2p_lock - must use create_send_tag() to
create a send tag */

View file

@ -279,6 +279,8 @@ ompi_osc_pt2pt_component_select(ompi_win_t *win,
module->p2p_num_pending_out = 0;
module->p2p_num_pending_in = 0;
module->p2p_num_post_msgs = 0;
module->p2p_num_complete_msgs = 0;
module->p2p_tag_counter = 0;
OBJ_CONSTRUCT(&(module->p2p_long_msgs), opal_list_t);
@ -533,7 +535,7 @@ ompi_osc_pt2pt_component_fragment_cb(ompi_osc_pt2pt_module_t *module,
assert(module == ompi_osc_pt2pt_windx_to_module(header->hdr_windx));
OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), -1);
OPAL_THREAD_ADD32(&(module->p2p_num_post_msgs), -1);
}
break;
case OMPI_OSC_PT2PT_HDR_COMPLETE:
@ -552,7 +554,7 @@ ompi_osc_pt2pt_component_fragment_cb(ompi_osc_pt2pt_module_t *module,
/* we've heard from one more place, and have value reqs to
process */
OPAL_THREAD_ADD32(&(module->p2p_num_pending_out), -1);
OPAL_THREAD_ADD32(&(module->p2p_num_complete_msgs), -1);
OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), header->hdr_value[0]);
}
break;

View file

@ -204,7 +204,7 @@ ompi_osc_pt2pt_module_start(ompi_group_t *group,
/* possible we've already received a couple in messages, so
atomicall add however many we're going to wait for */
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_pending_in),
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_post_msgs),
ompi_group_size(P2P_MODULE(win)->p2p_sc_group));
return OMPI_SUCCESS;
@ -220,7 +220,7 @@ ompi_osc_pt2pt_module_complete(ompi_win_t *win)
opal_list_item_t *item;
/* wait for all the post messages */
while (0 != P2P_MODULE(win)->p2p_num_pending_in) {
while (0 != P2P_MODULE(win)->p2p_num_post_msgs) {
ompi_osc_pt2pt_progress_long(P2P_MODULE(win));
}
@ -318,7 +318,7 @@ ompi_osc_pt2pt_module_post(ompi_group_t *group,
ompi_win_set_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED);
/* list how many complete counters we're still waiting on */
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_pending_out),
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_complete_msgs),
ompi_group_size(P2P_MODULE(win)->p2p_pw_group));
/* send a hello counter to everyone in group */
@ -338,7 +338,7 @@ ompi_osc_pt2pt_module_wait(ompi_win_t *win)
ompi_group_t *group;
while (0 != (P2P_MODULE(win)->p2p_num_pending_in) ||
0 != (P2P_MODULE(win)->p2p_num_pending_out)) {
0 != (P2P_MODULE(win)->p2p_num_complete_msgs)) {
ompi_osc_pt2pt_progress_long(P2P_MODULE(win));
}
@ -364,10 +364,10 @@ ompi_osc_pt2pt_module_test(ompi_win_t *win,
ompi_group_t *group;
if (0 != (P2P_MODULE(win)->p2p_num_pending_in) ||
0 != (P2P_MODULE(win)->p2p_num_pending_out)) {
0 != (P2P_MODULE(win)->p2p_num_complete_msgs)) {
ompi_osc_pt2pt_progress_long(P2P_MODULE(win));
if (0 != (P2P_MODULE(win)->p2p_num_pending_in) ||
0 != (P2P_MODULE(win)->p2p_num_pending_out)) {
0 != (P2P_MODULE(win)->p2p_num_complete_msgs)) {
*flag = 0;
return OMPI_SUCCESS;
}

View file

@ -79,20 +79,26 @@ struct ompi_osc_rdma_module_t {
/** For MPI_Fence synchronization, the number of messages to send
in epoch. For Start/Complete, the number of updates for this
Complete. For Post/Wait (poorly named), the number of
Complete counters we're waiting for. For lock, the number of
Complete. For lock, the number of
messages waiting for completion on on the origin side. Not
protected by p2p_lock - must use atomic counter operations. */
volatile int32_t p2p_num_pending_out;
/** For MPI_Fence synchronization, the number of expected incoming
messages. For Start/Complete, the number of expected Post
messages. For Post/Wait, the number of expected updates from
complete. For lock, the number of messages on the passive side
we are waiting for. Not protected by p2p_lock - must use
atomic counter operations. */
volatile int32_t p2p_num_pending_in;
/** Number of "ping" messages from the remote post group we've
received */
volatile int32_t p2p_num_post_msgs;
/** Number of "count" messages from the remote complete group
we've received */
volatile int32_t p2p_num_complete_msgs;
/** cyclic counter for a unique tage for long messages. Not
protected by the p2p_lock - must use create_send_tag() to
create a send tag */

View file

@ -36,6 +36,7 @@
#include "ompi/datatype/dt_arch.h"
static int ompi_osc_rdma_component_open(void);
static int32_t registered_callback = 0;
ompi_osc_rdma_component_t mca_osc_rdma_component = {
{ /* ompi_osc_base_component_t */
@ -278,6 +279,8 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
module->p2p_num_pending_out = 0;
module->p2p_num_pending_in = 0;
module->p2p_num_post_msgs = 0;
module->p2p_num_complete_msgs = 0;
module->p2p_tag_counter = 0;
OBJ_CONSTRUCT(&(module->p2p_long_msgs), opal_list_t);
@ -376,10 +379,12 @@ ompi_osc_rdma_component_select(ompi_win_t *win,
/* sync memory - make sure all initialization completed */
opal_atomic_mb();
/* register to receive fragment callbacks */
ret = mca_bml.bml_register(MCA_BTL_TAG_OSC_RDMA,
ompi_osc_rdma_component_fragment_cb,
NULL);
/* register to receive fragment callbacks, if not already done */
if (OPAL_THREAD_ADD32(&registered_callback, 1) <= 1) {
ret = mca_bml.bml_register(MCA_BTL_TAG_OSC_RDMA,
ompi_osc_rdma_component_fragment_cb,
NULL);
}
if (module->p2p_eager_send) {
@ -558,7 +563,7 @@ ompi_osc_rdma_component_fragment_cb(struct mca_btl_base_module_t *btl,
module = ompi_osc_rdma_windx_to_module(header->hdr_windx);
if (NULL == module) return;
OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), -1);
OPAL_THREAD_ADD32(&(module->p2p_num_post_msgs), -1);
}
break;
case OMPI_OSC_RDMA_HDR_COMPLETE:
@ -579,7 +584,7 @@ ompi_osc_rdma_component_fragment_cb(struct mca_btl_base_module_t *btl,
/* we've heard from one more place, and have value reqs to
process */
OPAL_THREAD_ADD32(&(module->p2p_num_pending_out), -1);
OPAL_THREAD_ADD32(&(module->p2p_num_complete_msgs), -1);
OPAL_THREAD_ADD32(&(module->p2p_num_pending_in), header->hdr_value[0]);
}
break;

View file

@ -239,7 +239,7 @@ ompi_osc_rdma_module_start(ompi_group_t *group,
/* possible we've already received a couple in messages, so
atomicall add however many we're going to wait for */
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_pending_in),
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_post_msgs),
ompi_group_size(P2P_MODULE(win)->p2p_sc_group));
return OMPI_SUCCESS;
@ -255,7 +255,7 @@ ompi_osc_rdma_module_complete(ompi_win_t *win)
opal_list_item_t *item;
/* wait for all the post messages */
while (0 != P2P_MODULE(win)->p2p_num_pending_in) {
while (0 != P2P_MODULE(win)->p2p_num_post_msgs) {
ompi_osc_rdma_progress(P2P_MODULE(win));
}
@ -353,7 +353,7 @@ ompi_osc_rdma_module_post(ompi_group_t *group,
ompi_win_set_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED);
/* list how many complete counters we're still waiting on */
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_pending_out),
OPAL_THREAD_ADD32(&(P2P_MODULE(win)->p2p_num_complete_msgs),
ompi_group_size(P2P_MODULE(win)->p2p_pw_group));
/* send a hello counter to everyone in group */
@ -373,7 +373,7 @@ ompi_osc_rdma_module_wait(ompi_win_t *win)
ompi_group_t *group;
while (0 != (P2P_MODULE(win)->p2p_num_pending_in) ||
0 != (P2P_MODULE(win)->p2p_num_pending_out)) {
0 != (P2P_MODULE(win)->p2p_num_complete_msgs)) {
ompi_osc_rdma_progress(P2P_MODULE(win));
}
@ -399,10 +399,10 @@ ompi_osc_rdma_module_test(ompi_win_t *win,
ompi_group_t *group;
if (0 != (P2P_MODULE(win)->p2p_num_pending_in) ||
0 != (P2P_MODULE(win)->p2p_num_pending_out)) {
0 != (P2P_MODULE(win)->p2p_num_complete_msgs)) {
ompi_osc_rdma_progress(P2P_MODULE(win));
if (0 != (P2P_MODULE(win)->p2p_num_pending_in) ||
0 != (P2P_MODULE(win)->p2p_num_pending_out)) {
0 != (P2P_MODULE(win)->p2p_num_complete_msgs)) {
*flag = 0;
return OMPI_SUCCESS;
}