1
1

Fixed a connection inversion bug by adding sequence checking to the
first sendrecv exchange on each connection.  This fixes Trac #390.

This commit was SVN r11821.
Этот коммит содержится в:
Terry Dontje 2006-09-26 13:53:00 +00:00
родитель b356fa22c9
Коммит bc93adee26
3 изменённых файлов: 37 добавлений и 4 удалений

Просмотреть файл

@ -388,11 +388,13 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
{
mca_btl_udapl_frag_t* frag;
DAT_DTO_COOKIE cookie;
static int32_t connection_seq = 1;
int rc;
/* Post a receive to get the peer's address data */
frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t));
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
sizeof(int32_t));
cookie.as_ptr = frag;
frag->type = MCA_BTL_UDAPL_CONN_RECV;
@ -407,11 +409,15 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
/* Send our local address data over this EP */
frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t));
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t) +
sizeof(int32_t));
cookie.as_ptr = frag;
memcpy(frag->segment.seg_addr.pval,
&btl->udapl_addr, sizeof(mca_btl_udapl_addr_t));
memcpy((char *)frag->segment.seg_addr.pval + sizeof(mca_btl_udapl_addr_t),
&connection_seq, sizeof(int32_t));
connection_seq++;
frag->type = MCA_BTL_UDAPL_CONN_SEND;
@ -586,6 +592,8 @@ int mca_btl_udapl_component_progress()
case MCA_BTL_UDAPL_CONN_RECV:
mca_btl_udapl_endpoint_finish_connect(btl,
frag->segment.seg_addr.pval,
(int32_t *)((char *)frag->segment.seg_addr.pval +
sizeof(mca_btl_udapl_addr_t)),
event.event_data.connect_event_data.ep_handle);
/* No break - fall through to free */
case MCA_BTL_UDAPL_CONN_SEND:

Просмотреть файл

@ -294,6 +294,7 @@ failure_create:
int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
mca_btl_udapl_addr_t* addr,
int32_t* connection_seq,
DAT_EP_HANDLE endpoint)
{
mca_btl_udapl_proc_t* proc;
@ -317,10 +318,27 @@ int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
!memcmp(addr, &ep->endpoint_addr, sizeof(DAT_SOCK_ADDR))) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
if(MCA_BTL_UDAPL_CONN_EAGER == ep->endpoint_state) {
ep->endpoint_connection_seq = *connection_seq;
ep->endpoint_eager = endpoint;
rc = mca_btl_udapl_endpoint_finish_eager(ep);
} else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) {
ep->endpoint_max = endpoint;
} else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) {
/* Check whether the sendrecv messages were received in
* the same order in which the actual connections were made.
* If they were not, we need to swap the eager and
* max connections. This inversion is possible because of
* a race condition: one process may actually receive
* the sendrecv message from the max connection
* before the one from the eager connection.
*/
if (ep->endpoint_connection_seq < *connection_seq) {
/* normal order connection matching */
ep->endpoint_max = endpoint;
} else {
/* inverted order connection matching */
ep->endpoint_max = ep->endpoint_eager;
ep->endpoint_eager = endpoint;
}
rc = mca_btl_udapl_endpoint_finish_max(ep);
} else {
OPAL_OUTPUT((0, "btl_udapl ERROR invalid EP state %d\n",
@ -516,6 +534,7 @@ static void mca_btl_udapl_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
endpoint->endpoint_btl = 0;
endpoint->endpoint_proc = 0;
endpoint->endpoint_connection_seq = 0;
endpoint->endpoint_eager_sends = mca_btl_udapl_component.udapl_num_sends;
endpoint->endpoint_max_sends = mca_btl_udapl_component.udapl_num_sends;

Просмотреть файл

@ -11,6 +11,8 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -82,6 +84,9 @@ struct mca_btl_base_endpoint_t {
int32_t endpoint_max_sends;
/**< number of sends that may be posted */
int32_t endpoint_connection_seq;
/**< sequence number of the sendrecv message used for connection establishment */
opal_mutex_t endpoint_lock;
/**< lock for concurrent access to endpoint state */
@ -118,6 +123,7 @@ void mca_btl_udapl_endpoint_post_oob_recv(void);
int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
mca_btl_udapl_addr_t* addr,
int32_t* seq,
DAT_EP_HANDLE endpoint);
#if defined(c_plusplus) || defined(__cplusplus)