1
1
Trying to remember what I did here... eager/max messages should work now; no RDMA yet. A number of other fixes and cleanups.

I do know of two problems:
 Bad stuff happens when flooded with send frags too quickly - the BTL doesn't handle flow control.
 Certain IBM tests turn up a length assertion in the datatype engine - needs more investigation.

This commit was SVN r10070.
Этот коммит содержится в:
Andrew Friedley 2006-05-25 15:47:59 +00:00
родитель b4e1a61cc4
Коммит 8a3d0862ca
6 изменённых файлов: 287 добавлений и 197 удалений

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -139,8 +141,25 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
}
/* Save the port with the address information */
/* TODO - since we're doing the hack below, do we need our own port? */
btl->udapl_addr.port = port;
/* TODO - big bad evil hack! */
/* uDAPL doesn't ever seem to keep track of ports with addresses. This
becomes a problem when we use dat_ep_query() to obtain a remote address
on an endpoint. In this case, both the DAT_PORT_QUAL and the sin_port
field in the DAT_SOCK_ADDR are 0, regardless of the actual port. This is
a problem when we have more than one uDAPL process per IA - these
processes will have exactly the same address, as the port is all
we have to differentiate who is who. Thus, our uDAPL EP -> BTL EP
matching algorithm will break down.
So, we insert the port we used for our PSP into the DAT_SOCK_ADDR for
this IA. uDAPL then conveniently propagates this to where we need it.
*/
((struct sockaddr_in*)attr.ia_address_ptr)->sin_port = htons(port);
((struct sockaddr_in*)&btl->udapl_addr.addr)->sin_port = htons(port);
/* initialize the memory pool */
res.udapl_ia = btl->udapl_ia;
res.udapl_pz = btl->udapl_pz;
@ -160,22 +179,22 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
/* initialize free lists */
ompi_free_list_init(&btl->udapl_frag_eager,
sizeof(mca_btl_udapl_frag_eager_t) +
mca_btl_udapl_module.super.btl_eager_limit,
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
sizeof(mca_btl_udapl_frag_eager_t) +
mca_btl_udapl_component.udapl_eager_frag_size,
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
ompi_free_list_init(&btl->udapl_frag_max,
sizeof(mca_btl_udapl_frag_max_t) +
mca_btl_udapl_module.super.btl_max_send_size,
OBJ_CLASS(mca_btl_udapl_frag_max_t),
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
sizeof(mca_btl_udapl_frag_max_t) +
mca_btl_udapl_component.udapl_max_frag_size,
OBJ_CLASS(mca_btl_udapl_frag_max_t),
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
ompi_free_list_init(&btl->udapl_frag_user,
sizeof(mca_btl_udapl_frag_user_t),
@ -330,14 +349,12 @@ mca_btl_base_descriptor_t* mca_btl_udapl_alloc(
mca_btl_udapl_frag_t* frag;
int rc;
OPAL_OUTPUT((0, "udapl_alloc %d\n", size));
if(size <= btl->btl_eager_limit - sizeof(mca_btl_base_header_t)) {
if(size <= btl->btl_eager_limit) {
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(udapl_btl, frag, rc);
frag->segment.seg_len =
size <= btl->btl_eager_limit ?
size : btl->btl_eager_limit;
} else if(size <= btl->btl_max_send_size - sizeof(mca_btl_base_header_t) ) {
} else if(size <= btl->btl_max_send_size) {
MCA_BTL_UDAPL_FRAG_ALLOC_MAX(udapl_btl, frag, rc);
frag->segment.seg_len =
size <= btl->btl_max_send_size ?
@ -372,8 +389,6 @@ int mca_btl_udapl_free(
{
mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)des;
OPAL_OUTPUT((0, "udapl_free\n"));
if(frag->size == 0) {
btl->btl_mpool->mpool_release(btl->btl_mpool, frag->registration);
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl, frag);
@ -411,8 +426,6 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
int32_t free_after;
int rc;
OPAL_OUTPUT((0, "udapl_prepare_src\n"));
/*
* If the data has already been pinned and is contigous than we can
* use it in place.
@ -498,27 +511,28 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
*/
else
#endif
if (max_data+reserve <= btl->btl_eager_limit) {
if(max_data + reserve <= btl->btl_eager_limit) {
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
OPAL_OUTPUT((0, "udapl_prepare_src 3\n"));
//OPAL_OUTPUT((0, "udapl_prepare_src 3\n"));
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor,
&iov, &iov_count, &max_data, &free_after);
*size = max_data;
if( rc < 0 ) {
*size = max_data;
if(rc < 0) {
MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag);
return NULL;
}
frag->segment.seg_len = max_data + reserve;
frag->triplet.segment_length = max_data + reserve;
frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base;
frag->triplet.segment_length =
max_data + reserve + sizeof(mca_btl_base_header_t);
frag->triplet.virtual_address = (DAT_VADDR)frag->hdr;
}
/*
@ -526,28 +540,33 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
* that is the max send size.
*/
else {
OPAL_OUTPUT((0, "udapl_prepare_src 4\n"));
//OPAL_OUTPUT((0, "udapl_prepare_src 4\n"));
MCA_BTL_UDAPL_FRAG_ALLOC_MAX(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
if(max_data + reserve > btl->btl_max_send_size){
max_data = btl->btl_max_send_size - reserve;
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor,
&iov, &iov_count, &max_data, &free_after);
*size = max_data;
*size = max_data;
if( rc < 0 ) {
if(rc < 0) {
MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag);
return NULL;
}
/* TODO - pull this out of the if statements. */
frag->segment.seg_len = max_data + reserve;
frag->triplet.segment_length = max_data + reserve;
frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base;
frag->triplet.segment_length =
max_data + reserve + sizeof(mca_btl_base_header_t);
frag->triplet.virtual_address = (DAT_VADDR)frag->hdr;
}
frag->base.des_src = &frag->segment;
@ -572,7 +591,7 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
*/
#if 0
mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
@ -629,7 +648,7 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst(
}
return &frag->base;
}
#endif
/**
* Initiate an asynchronous send.

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -149,7 +151,7 @@ int mca_btl_udapl_component_open(void)
mca_btl_udapl_component.udapl_mpool_name =
mca_btl_udapl_param_register_string("mpool", "udapl");
mca_btl_udapl_component.udapl_max_btls =
mca_btl_udapl_param_register_int("max_modules", 4);
mca_btl_udapl_param_register_int("max_modules", 8);
mca_btl_udapl_component.udapl_evd_qlen =
mca_btl_udapl_param_register_int("evd_qlen", 8);
mca_btl_udapl_component.udapl_num_repost =
@ -169,7 +171,7 @@ int mca_btl_udapl_component_open(void)
mca_btl_udapl_module.super.btl_eager_limit =
mca_btl_udapl_param_register_int ("eager_limit", 32*1024);
mca_btl_udapl_module.super.btl_min_send_size =
mca_btl_udapl_param_register_int ("min_send_size", 32*1024);
mca_btl_udapl_param_register_int ("min_send_size", 16*1024);
mca_btl_udapl_module.super.btl_max_send_size =
mca_btl_udapl_param_register_int ("max_send_size", 64*1024);
mca_btl_udapl_module.super.btl_min_rdma_size =
@ -181,9 +183,15 @@ int mca_btl_udapl_component_open(void)
/* compute udapl_eager_frag_size and udapl_max_frag_size */
mca_btl_udapl_component.udapl_eager_frag_size =
mca_btl_udapl_module.super.btl_eager_limit;
mca_btl_udapl_module.super.btl_eager_limit;
mca_btl_udapl_module.super.btl_eager_limit -=
sizeof(mca_btl_base_header_t);
mca_btl_udapl_component.udapl_max_frag_size =
mca_btl_udapl_module.super.btl_max_send_size;
mca_btl_udapl_module.super.btl_max_send_size;
mca_btl_udapl_module.super.btl_max_send_size -=
sizeof(mca_btl_base_header_t);
/* leave pinned option */
value = 0;
@ -264,8 +272,8 @@ mca_btl_udapl_component_init (int *num_btl_modules,
OPAL_OUTPUT((0, "udapl_component_init\n"));
/* enumerate uDAPL interfaces */
/* Have to do weird pointer stuff to make uDAPL happy -
just an array of DAT_PROVIDER_INFO isn't good enough. */
/* Have to do weird pointer stuff to make uDAPL happy -
just an array of DAT_PROVIDER_INFO isn't good enough. */
datinfo = malloc(sizeof(DAT_PROVIDER_INFO) *
mca_btl_udapl_component.udapl_max_btls);
datinfoptr = malloc(sizeof(DAT_PROVIDER_INFO*) *
@ -274,19 +282,19 @@ mca_btl_udapl_component_init (int *num_btl_modules,
return NULL;
}
for(i = 0; i < mca_btl_udapl_component.udapl_max_btls; i++) {
datinfoptr[i] = &datinfo[i];
}
for(i = 0; i < (int32_t)mca_btl_udapl_component.udapl_max_btls; i++) {
datinfoptr[i] = &datinfo[i];
}
if(DAT_SUCCESS != dat_registry_list_providers(
mca_btl_udapl_component.udapl_max_btls,
(DAT_COUNT*)&num_ias, &datinfo)) {
(DAT_COUNT*)&num_ias, datinfoptr)) {
free(datinfo);
free(datinfoptr);
free(datinfoptr);
return NULL;
}
free(datinfoptr);
free(datinfoptr);
/* allocate space for the each possible BTL */
mca_btl_udapl_component.udapl_btls = (mca_btl_udapl_module_t **)
@ -318,7 +326,7 @@ mca_btl_udapl_component_init (int *num_btl_modules,
}
/* successful btl creation */
mca_btl_udapl_component.udapl_btls[i] = btl;
mca_btl_udapl_component.udapl_btls[mca_btl_udapl_component.udapl_num_btls] = btl;
if(++mca_btl_udapl_component.udapl_num_btls >=
mca_btl_udapl_component.udapl_max_btls) {
break;
@ -362,8 +370,6 @@ mca_btl_udapl_component_init (int *num_btl_modules,
static int mca_btl_udapl_accept_connect(mca_btl_udapl_module_t* btl, DAT_CR_HANDLE cr_handle)
{
mca_btl_udapl_frag_t* frag;
DAT_DTO_COOKIE cookie;
DAT_EP_HANDLE endpoint;
int rc;
@ -381,21 +387,6 @@ static int mca_btl_udapl_accept_connect(mca_btl_udapl_module_t* btl, DAT_CR_HAND
return OMPI_ERROR;
}
/* Post a receive to get the address data */
frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t));
cookie.as_ptr = frag;
frag->endpoint = NULL;
frag->type = MCA_BTL_UDAPL_CONN_RECV;
rc = dat_ep_post_recv(endpoint, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_recv");
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
@ -431,23 +422,17 @@ int mca_btl_udapl_component_progress()
switch(event.event_number) {
case DAT_DTO_COMPLETION_EVENT:
/* questions to answer:
should i use separate endpoints for eager/max frags?
i need to do this if i only want to post recv's for
the exact eager/max size, and uDAPL won't just pick
a large enough buffer
*/
dto = &event.event_data.dto_completion_event_data;
frag = dto->user_cookie.as_ptr;
/* Was the DTO successful? */
if(DAT_DTO_SUCCESS != dto->status) {
OPAL_OUTPUT((0,
"btl_udapl DTO error %d\n", dto->status));
"btl_udapl ***** DTO error %d *****\n",
dto->status));
break;
}
frag = dto->user_cookie.as_ptr;
switch(frag->type) {
case MCA_BTL_UDAPL_SEND:
OPAL_OUTPUT((0, "btl_udapl UDAPL_SEND %d",
@ -479,24 +464,23 @@ int mca_btl_udapl_component_progress()
frag->segment.seg_len =
frag->size - sizeof(mca_btl_base_header_t);
dat_ep_post_recv(frag->endpoint->endpoint_ep,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
if(frag->size ==
mca_btl_udapl_component.udapl_eager_frag_size) {
dat_ep_post_recv(frag->endpoint->endpoint_eager,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
} else if(frag->size ==
mca_btl_udapl_component.udapl_max_frag_size) {
dat_ep_post_recv(frag->endpoint->endpoint_max,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
} else {
OPAL_OUTPUT((0,
"btl_udapl ERROR unknown frag size\n"));
}
break;
}
case MCA_BTL_UDAPL_CONN_SEND:
/* Client (send) side connection established */
mca_btl_udapl_endpoint_post_queue(frag->endpoint);
MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag);
break;
case MCA_BTL_UDAPL_CONN_RECV:
/* Just got the address data we need for completing
a new connection - match endpoints */
mca_btl_udapl_endpoint_match(btl,
frag->segment.seg_addr.pval, dto->ep_handle);
MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag);
break;
default:
OPAL_OUTPUT((0, "WARNING unknown frag type: %d\n",
frag->type));
@ -523,10 +507,9 @@ int mca_btl_udapl_component_progress()
case DAT_CONNECTION_EVENT_ESTABLISHED:
/* Both the client and server side of a connection generate
this event */
/* Really shouldn't do anything here, as we won't have the
address data we need to match a uDAPL EP to a BTL EP.
Connections are finished when DTOs are completed for
the address transfer */
mca_btl_udapl_endpoint_finish_connect(btl,
event.event_data.connect_event_data.ep_handle);
count++;
break;

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -36,10 +38,13 @@
static void mca_btl_udapl_endpoint_send_cb(int status, orte_process_name_t* endpoint,
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata);
static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint);
static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint);
static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint,
size_t size);
void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint);
void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint,
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata);
static int mca_btl_udapl_endpoint_finish_eager(mca_btl_udapl_endpoint_t*);
static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t*);
int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint,
@ -54,8 +59,18 @@ int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint,
/* just send it already.. */
OPAL_OUTPUT((0, "sending %d bytes\n", frag->triplet.segment_length));
cookie.as_ptr = frag;
rc = dat_ep_post_send(endpoint->endpoint_ep, 1, &frag->triplet,
cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(frag->size ==
mca_btl_udapl_component.udapl_eager_frag_size) {
rc = dat_ep_post_send(endpoint->endpoint_eager, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
} else if(frag->size ==
mca_btl_udapl_component.udapl_max_frag_size) {
rc = dat_ep_post_send(endpoint->endpoint_max, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
} else {
OPAL_OUTPUT((0, "btl_udapl ERROR unknown frag size\n"));
}
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send");
rc = OMPI_ERROR;
@ -71,8 +86,10 @@ int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint,
}
/* Fall through on purpose to queue the send */
case MCA_BTL_UDAPL_CONNECTING:
case MCA_BTL_UDAPL_CONN_EAGER:
case MCA_BTL_UDAPL_CONN_MAX:
/* Add this send to a queue */
OPAL_OUTPUT((0, "queueing send %d bytes\n", frag->triplet.segment_length));
opal_list_append(&endpoint->endpoint_frags,
(opal_list_item_t*)frag);
break;
@ -125,7 +142,7 @@ static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint)
return rc;
}
endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTING;
endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_EAGER;
return OMPI_SUCCESS;
}
@ -165,7 +182,7 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint,
if(0 == orte_ns.compare(ORTE_NS_CMP_ALL, &proc->proc_guid, endpoint)) {
for(i = 0; i < proc->proc_endpoint_count; i++) {
ep = proc->proc_endpoints[i];
/* Does this endpoint match? */
if(!memcmp(&addr, &ep->endpoint_addr,
sizeof(mca_btl_udapl_addr_t))) {
@ -178,6 +195,7 @@ void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint,
}
}
OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock);
OBJ_RELEASE(buffer);
}
@ -195,20 +213,18 @@ void mca_btl_udapl_endpoint_post_oob_recv(void)
void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint)
{
mca_btl_udapl_module_t* btl = endpoint->endpoint_btl;
mca_btl_udapl_frag_t* frag;
DAT_DTO_COOKIE cookie;
int rc;
OPAL_THREAD_LOCK(&endpoint->endpoint_send_lock);
/* Nasty test to prevent deadlock and unwanted connection attempts */
/* This right here is the whole point of using the ORTE/RML handshake */
if((MCA_BTL_UDAPL_CONNECTING == endpoint->endpoint_state &&
if((MCA_BTL_UDAPL_CONN_EAGER == endpoint->endpoint_state &&
0 > orte_ns.compare(ORTE_NS_CMP_ALL,
&endpoint->endpoint_proc->proc_guid,
&ompi_proc_local()->proc_name)) ||
(MCA_BTL_UDAPL_CLOSED != endpoint->endpoint_state &&
MCA_BTL_UDAPL_CONNECTING != endpoint->endpoint_state)) {
MCA_BTL_UDAPL_CONN_EAGER != endpoint->endpoint_state)) {
OPAL_THREAD_UNLOCK(&endpoint->endpoint_send_lock);
return;
}
@ -216,48 +232,28 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint)
/* Create a new uDAPL endpoint and start the connection process */
rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz,
btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn,
NULL, &endpoint->endpoint_ep);
NULL, &endpoint->endpoint_eager);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create");
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create (eager)");
goto failure_create;
}
rc = dat_ep_connect(endpoint->endpoint_ep, &endpoint->endpoint_addr.addr,
rc = dat_ep_connect(endpoint->endpoint_eager, &endpoint->endpoint_addr.addr,
endpoint->endpoint_addr.port, mca_btl_udapl_component.udapl_timeout,
0, NULL, 0, DAT_CONNECT_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_connect");
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_connect (eager)");
goto failure;
}
/* Send our local address data over this EP */
/* Can't use btl_udapl_send here, will start an infinite loop! */
frag = (mca_btl_udapl_frag_t*)mca_btl_udapl_alloc(
(mca_btl_base_module_t*)btl, sizeof(mca_btl_udapl_addr_t));
cookie.as_ptr = frag;
memcpy(frag->segment.seg_addr.pval,
&btl->udapl_addr, sizeof(mca_btl_udapl_addr_t));
frag->endpoint = endpoint;
frag->type = MCA_BTL_UDAPL_CONN_SEND;
/* Do the actual send now.. */
rc = dat_ep_post_send(endpoint->endpoint_ep, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send");
goto failure;
}
endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTING;
endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_EAGER;
OPAL_THREAD_UNLOCK(&endpoint->endpoint_send_lock);
return;
failure:
dat_ep_free(endpoint->endpoint_ep);
dat_ep_free(endpoint->endpoint_eager);
failure_create:
endpoint->endpoint_ep = DAT_HANDLE_NULL;
endpoint->endpoint_eager = DAT_HANDLE_NULL;
endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED;
OPAL_THREAD_UNLOCK(&endpoint->endpoint_send_lock);
return;
@ -265,46 +261,29 @@ failure_create:
/*
* Post queued sends.
* Finish establishing a connection
*/
int mca_btl_udapl_endpoint_post_queue(mca_btl_udapl_endpoint_t* endpoint)
{
mca_btl_udapl_frag_t* frag;
DAT_DTO_COOKIE cookie;
int rc = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&endpoint->endpoint_send_lock);
endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED;
while(NULL != (frag = (mca_btl_udapl_frag_t*)
opal_list_remove_first(&endpoint->endpoint_frags))) {
cookie.as_ptr = frag;
rc = dat_ep_post_send(endpoint->endpoint_ep, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send");
rc = OMPI_ERROR;
break;
}
}
OPAL_THREAD_UNLOCK(&endpoint->endpoint_send_lock);
return mca_btl_udapl_endpoint_post_recv(endpoint);
}
/*
* Match a uDAPL endpoint to a BTL endpoint.
*/
int mca_btl_udapl_endpoint_match(struct mca_btl_udapl_module_t* btl,
mca_btl_udapl_addr_t* addr,
DAT_EP_HANDLE endpoint)
int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
DAT_EP_HANDLE endpoint)
{
mca_btl_udapl_proc_t* proc;
mca_btl_base_endpoint_t* ep;
DAT_EP_PARAM param;
size_t i;
int rc;
/* Query the DAT EP for address information. */
/* TODO - refer to the hack comment about setting the port in btl_udapl.c */
rc = dat_ep_query(endpoint,
DAT_EP_FIELD_REMOTE_IA_ADDRESS_PTR | DAT_EP_FIELD_REMOTE_PORT_QUAL,
&param);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_query");
return OMPI_ERROR;
}
/* Search for the matching BTL EP */
OPAL_THREAD_LOCK(&mca_btl_udapl_component.udapl_lock);
for(proc = (mca_btl_udapl_proc_t*)
opal_list_get_first(&mca_btl_udapl_component.udapl_procs);
@ -314,16 +293,29 @@ int mca_btl_udapl_endpoint_match(struct mca_btl_udapl_module_t* btl,
for(i = 0; i < proc->proc_endpoint_count; i++) {
ep = proc->proc_endpoints[i];
/* Does this endpoint match? */
/* TODO - Check that the DAT_CONN_QUAL's match too */
if(ep->endpoint_btl == btl &&
!memcmp(addr, &ep->endpoint_addr,
sizeof(mca_btl_udapl_addr_t))) {
!memcmp(param.remote_ia_address_ptr,
&ep->endpoint_addr.addr, sizeof(DAT_SOCK_ADDR))) {
OPAL_OUTPUT((0, "btl_udapl matched endpoint!\n"));
ep->endpoint_ep = endpoint;
mca_btl_udapl_endpoint_post_queue(ep);
OPAL_THREAD_LOCK(&endpoint->endpoint_send_lock);
if(MCA_BTL_UDAPL_CONN_EAGER == ep->endpoint_state) {
ep->endpoint_eager = endpoint;
rc = mca_btl_udapl_endpoint_finish_eager(ep);
} else if(MCA_BTL_UDAPL_CONN_MAX == ep->endpoint_state) {
ep->endpoint_max = endpoint;
rc = mca_btl_udapl_endpoint_finish_max(ep);
} else {
OPAL_OUTPUT((0, "btl_udapl ERROR invalid EP state %d\n",
ep->endpoint_state));
return OMPI_ERROR;
}
OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock);
return OMPI_SUCCESS;
return rc;
}
}
}
@ -335,27 +327,117 @@ int mca_btl_udapl_endpoint_match(struct mca_btl_udapl_module_t* btl,
}
/*
 * Finish setting up an eager connection, then start bringing up the
 * max-size connection for this endpoint.
 *
 * Advances the endpoint state from CONN_EAGER to CONN_MAX and, on one
 * side only, creates and connects the "max" uDAPL EP.
 *
 * NOTE(review): appears to be entered with endpoint_send_lock held and
 * releases it here — confirm against the caller
 * (mca_btl_udapl_endpoint_finish_connect).
 *
 * @param endpoint (IN) BTL endpoint whose eager connection just completed
 * @return OMPI_SUCCESS, or OMPI_ERROR if EP creation/connect fails
 */
static int mca_btl_udapl_endpoint_finish_eager(
mca_btl_udapl_endpoint_t* endpoint)
{
mca_btl_udapl_module_t* btl = endpoint->endpoint_btl;
int rc;
/* Eager EP is up; next DTO/connection activity belongs to the max EP. */
endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_MAX;
OPAL_THREAD_UNLOCK(&endpoint->endpoint_send_lock);
/* Only one side does dat_ep_connect() */
/* The GUID comparison picks a deterministic initiator so both peers
   don't simultaneously dial each other for the max connection. */
if(0 > orte_ns.compare(ORTE_NS_CMP_ALL,
&endpoint->endpoint_proc->proc_guid,
&ompi_proc_local()->proc_name)) {
rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz,
btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn,
NULL, &endpoint->endpoint_max);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create (max)");
return OMPI_ERROR;
}
rc = dat_ep_connect(endpoint->endpoint_max,
&endpoint->endpoint_addr.addr, endpoint->endpoint_addr.port,
mca_btl_udapl_component.udapl_timeout,
0, NULL, 0, DAT_CONNECT_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_connect (max)");
/* Release the EP we just created so the handle isn't leaked. */
dat_ep_free(endpoint->endpoint_max);
return OMPI_ERROR;
}
}
return OMPI_SUCCESS;
}
/*
 * Finish setting up the max connection: mark the endpoint CONNECTED,
 * post receive buffers on both the eager and max EPs, then drain any
 * sends that were queued while the connection was being established.
 *
 * NOTE(review): appears to be entered with endpoint_send_lock held and
 * releases it on exit — confirm against the caller.
 *
 * @param endpoint (IN) BTL endpoint whose max connection just completed
 * @return OMPI_SUCCESS, or OMPI_ERROR if posting a queued send fails
 */
static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint)
{
mca_btl_udapl_frag_t* frag;
DAT_DTO_COOKIE cookie;
int ret = OMPI_SUCCESS;
int rc;
endpoint->endpoint_state = MCA_BTL_UDAPL_CONNECTED;
/* post eager/max recv buffers */
/* NOTE(review): return values of these two calls are not checked —
   a failed pre-post would go unnoticed here. */
mca_btl_udapl_endpoint_post_recv(endpoint,
mca_btl_udapl_component.udapl_eager_frag_size);
mca_btl_udapl_endpoint_post_recv(endpoint,
mca_btl_udapl_component.udapl_max_frag_size);
/* post queued sends */
while(NULL != (frag = (mca_btl_udapl_frag_t*)
opal_list_remove_first(&endpoint->endpoint_frags))) {
cookie.as_ptr = frag;
/* Route the frag to the EP matching its free-list size class. */
if(frag->size ==
mca_btl_udapl_component.udapl_eager_frag_size) {
rc = dat_ep_post_send(endpoint->endpoint_eager, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
} else if(frag->size == mca_btl_udapl_component.udapl_max_frag_size) {
rc = dat_ep_post_send(endpoint->endpoint_max, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
} else {
OPAL_OUTPUT((0, "btl_udapl ERROR unknown frag size\n"));
/* Force the error path below for an unroutable frag. */
rc = !DAT_SUCCESS;
}
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send");
endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED;
ret = OMPI_ERROR;
break;
}
}
OPAL_THREAD_UNLOCK(&endpoint->endpoint_send_lock);
return ret;
}
/*
* Post receive buffers for a newly established endpoint connection.
*/
static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint)
static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint,
size_t size)
{
mca_btl_udapl_frag_t* frag;
mca_btl_udapl_frag_t* frag = NULL;
DAT_DTO_COOKIE cookie;
int rc;
int i;
/* TODO - only posting eager frags for now. */
OPAL_THREAD_LOCK(&endpoint->endpoint_recv_lock);
for(i = 0; i < mca_btl_udapl_component.udapl_num_repost; i++) {
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(endpoint->endpoint_btl, frag, rc);
if(size == mca_btl_udapl_component.udapl_eager_frag_size) {
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(endpoint->endpoint_btl, frag, rc);
} else if(size == mca_btl_udapl_component.udapl_max_frag_size) {
MCA_BTL_UDAPL_FRAG_ALLOC_MAX(endpoint->endpoint_btl, frag, rc);
}
/* Set up the LMR triplet from the frag segment */
/* Note that this triplet defines a sub-region of a registered LMR */
frag->triplet.virtual_address = (DAT_VADDR)frag->hdr;
frag->triplet.segment_length =
mca_btl_udapl_module.super.btl_eager_limit;
frag->triplet.segment_length = frag->size;
frag->btl = endpoint->endpoint_btl;
frag->endpoint = endpoint;
@ -365,8 +447,16 @@ static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint)
cookie.as_ptr = frag;
rc = dat_ep_post_recv(endpoint->endpoint_ep, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(size == mca_btl_udapl_component.udapl_eager_frag_size) {
rc = dat_ep_post_recv(frag->endpoint->endpoint_eager, 1,
&frag->triplet,cookie, DAT_COMPLETION_DEFAULT_FLAG);
} else if(size == mca_btl_udapl_component.udapl_max_frag_size) {
rc = dat_ep_post_recv(frag->endpoint->endpoint_max, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
} else {
rc = !DAT_SUCCESS;
}
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_recv");
OPAL_THREAD_UNLOCK(&endpoint->endpoint_recv_lock);
@ -389,7 +479,8 @@ static void mca_btl_udapl_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
endpoint->endpoint_btl = 0;
endpoint->endpoint_proc = 0;
endpoint->endpoint_state = MCA_BTL_UDAPL_CLOSED;
endpoint->endpoint_ep = DAT_HANDLE_NULL;
endpoint->endpoint_eager = DAT_HANDLE_NULL;
endpoint->endpoint_max = DAT_HANDLE_NULL;
OBJ_CONSTRUCT(&endpoint->endpoint_frags, opal_list_t);
OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t);

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -45,7 +47,8 @@ typedef struct mca_btl_udapl_addr_t mca_btl_udapl_addr_t;
*/
typedef enum {
MCA_BTL_UDAPL_CONNECTING,
MCA_BTL_UDAPL_CONN_EAGER,
MCA_BTL_UDAPL_CONN_MAX,
MCA_BTL_UDAPL_CONNECTED,
MCA_BTL_UDAPL_CLOSED,
MCA_BTL_UDAPL_FAILED
@ -82,7 +85,8 @@ struct mca_btl_base_endpoint_t {
mca_btl_udapl_addr_t endpoint_addr;
DAT_EP_HANDLE endpoint_ep;
DAT_EP_HANDLE endpoint_eager;
DAT_EP_HANDLE endpoint_max;
/**< uDAPL endpoint handle */
};
@ -106,18 +110,11 @@ int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint,
void mca_btl_udapl_endpoint_post_oob_recv(void);
/*
* Post queued sends.
* Finish establishing a connection
*/
int mca_btl_udapl_endpoint_post_queue(mca_btl_udapl_endpoint_t* endpoint);
/*
* Match a uDAPL endpoint to a BTL endpoint.
*/
int mca_btl_udapl_endpoint_match(struct mca_btl_udapl_module_t* btl,
mca_btl_udapl_addr_t* addr,
DAT_EP_HANDLE endpoint);
int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
DAT_EP_HANDLE endpoint);
#if defined(c_plusplus) || defined(__cplusplus)
}

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -48,16 +50,14 @@ static void mca_btl_udapl_frag_common_constructor(mca_btl_udapl_frag_t* frag)
static void mca_btl_udapl_frag_eager_constructor(mca_btl_udapl_frag_t* frag)
{
frag->segment.seg_len = mca_btl_udapl_module.super.btl_eager_limit -
sizeof(mca_btl_base_header_t);
frag->segment.seg_len = mca_btl_udapl_module.super.btl_eager_limit;
frag->size = mca_btl_udapl_component.udapl_eager_frag_size;
mca_btl_udapl_frag_common_constructor(frag);
}
static void mca_btl_udapl_frag_max_constructor(mca_btl_udapl_frag_t* frag)
{
frag->segment.seg_len = mca_btl_udapl_module.super.btl_max_send_size -
sizeof(mca_btl_base_header_t);
frag->segment.seg_len = mca_btl_udapl_module.super.btl_max_send_size;
frag->size = mca_btl_udapl_component.udapl_max_frag_size;
mca_btl_udapl_frag_common_constructor(frag);
}

Просмотреть файл

@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -30,8 +32,6 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_t);
typedef enum {
MCA_BTL_UDAPL_CONN_SEND,
MCA_BTL_UDAPL_CONN_RECV,
MCA_BTL_UDAPL_SEND,
MCA_BTL_UDAPL_RECV,
MCA_BTL_UDAPL_PUT,