Fixed the connection inversion bug by adding sequence checking to the
first sendrecv exchange on each connection. This fixes Trac #390. This commit was SVN r11821.
Этот коммит содержится в:
родитель
b356fa22c9
Коммит
bc93adee26
@ -388,11 +388,13 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
|
||||
{
|
||||
mca_btl_udapl_frag_t* frag;
|
||||
DAT_DTO_COOKIE cookie;
|
||||
static int32_t connection_seq = 1;
|
||||
int rc;
|
||||
|
||||
/* Post a receive to get the peer's address data */
|
||||
frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
|
||||
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t));
|
||||
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
|
||||
sizeof(int32_t));
|
||||
cookie.as_ptr = frag;
|
||||
|
||||
frag->type = MCA_BTL_UDAPL_CONN_RECV;
|
||||
@ -407,11 +409,15 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
|
||||
|
||||
/* Send our local address data over this EP */
|
||||
frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
|
||||
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t));
|
||||
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
|
||||
sizeof(int32_t));
|
||||
cookie.as_ptr = frag;
|
||||
|
||||
memcpy(frag->segment.seg_addr.pval,
|
||||
&btl->udapl_addr, sizeof(mca_btl_udapl_addr_t));
|
||||
memcpy((char *)frag->segment.seg_addr.pval + sizeof(mca_btl_udapl_addr_t),
|
||||
&connection_seq, sizeof(int32_t));
|
||||
connection_seq++;
|
||||
|
||||
frag->type = MCA_BTL_UDAPL_CONN_SEND;
|
||||
|
||||
@ -586,6 +592,8 @@ int mca_btl_udapl_component_progress()
|
||||
case MCA_BTL_UDAPL_CONN_RECV:
|
||||
mca_btl_udapl_endpoint_finish_connect(btl,
|
||||
frag->segment.seg_addr.pval,
|
||||
(int32_t *)((char *)frag->segment.seg_addr.pval +
|
||||
sizeof(mca_btl_udapl_addr_t)),
|
||||
event.event_data.connect_event_data.ep_handle);
|
||||
/* No break - fall through to free */
|
||||
case MCA_BTL_UDAPL_CONN_SEND:
|
||||
|
@ -294,6 +294,7 @@ failure_create:
|
||||
|
||||
int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
|
||||
mca_btl_udapl_addr_t* addr,
|
||||
int32_t* connection_seq,
|
||||
DAT_EP_HANDLE endpoint)
|
||||
{
|
||||
mca_btl_udapl_proc_t* proc;
|
||||
@ -317,10 +318,27 @@ int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
|
||||
!memcmp(addr, &ep->endpoint_addr, sizeof(DAT_SOCK_ADDR))) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
if(MCA_BTL_UDAPL_CONN_EAGER == ep->endpoint_state) {
|
||||
ep->endpoint_connection_seq = *connection_seq;
|
||||
ep->endpoint_eager = endpoint;
|
||||
rc = mca_btl_udapl_endpoint_finish_eager(ep);
|
||||
} else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) {
|
||||
ep->endpoint_max = endpoint;
|
||||
} else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) {
|
||||
/* Check to see order of messages received are in
|
||||
* the same order the actual connections are made.
|
||||
* If they are not we need to swap the eager and
|
||||
* max connections. This inversion is possible due
|
||||
* to a race condition that one process may actually
|
||||
* receive the sendrecv messages from the max connection
|
||||
* before the eager connection.
|
||||
*/
|
||||
if (ep->endpoint_connection_seq < *connection_seq) {
|
||||
/* normal order connection matching */
|
||||
ep->endpoint_max = endpoint;
|
||||
} else {
|
||||
/* inverted order connection matching */
|
||||
ep->endpoint_max = ep->endpoint_eager;
|
||||
ep->endpoint_eager = endpoint;
|
||||
}
|
||||
|
||||
rc = mca_btl_udapl_endpoint_finish_max(ep);
|
||||
} else {
|
||||
OPAL_OUTPUT((0, "btl_udapl ERROR invalid EP state %d\n",
|
||||
@ -516,6 +534,7 @@ static void mca_btl_udapl_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
||||
endpoint->endpoint_btl = 0;
|
||||
endpoint->endpoint_proc = 0;
|
||||
|
||||
endpoint->endpoint_connection_seq = 0;
|
||||
endpoint->endpoint_eager_sends = mca_btl_udapl_component.udapl_num_sends;
|
||||
endpoint->endpoint_max_sends = mca_btl_udapl_component.udapl_num_sends;
|
||||
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -82,6 +84,9 @@ struct mca_btl_base_endpoint_t {
|
||||
int32_t endpoint_max_sends;
|
||||
/**< number of sends that may be posted */
|
||||
|
||||
int32_t endpoint_connection_seq;
|
||||
/**< sequence number of sendrecv message for the connection est */
|
||||
|
||||
opal_mutex_t endpoint_lock;
|
||||
/**< lock for concurrent access to endpoint state */
|
||||
|
||||
@ -118,6 +123,7 @@ void mca_btl_udapl_endpoint_post_oob_recv(void);
|
||||
|
||||
int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
|
||||
mca_btl_udapl_addr_t* addr,
|
||||
int32_t* seq,
|
||||
DAT_EP_HANDLE endpoint);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user