
Change over from lazy connection setup to setting up at initialization time.

UD is connectionless, and as long as peers are statically assigned to QPs,
there is no reason to set up the addressing information lazily.
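
As a rough illustration (a sketch, not the BTL source; the parameter names stand in for the MCA settings and the LID learned from the modex exchange), eager setup amounts to creating each peer's address handle as soon as its address is known:

    #include <infiniband/verbs.h>
    #include <stdint.h>
    #include <string.h>

    /* Create the UD address handle for one peer up front, instead of on the
     * first send.  Returns NULL (with errno set) on failure. */
    static struct ibv_ah* create_peer_ah(struct ibv_pd* pd, uint16_t peer_lid,
                                         uint8_t sl, uint8_t src_path_bits,
                                         uint8_t port_num)
    {
        struct ibv_ah_attr ah_attr;
        memset(&ah_attr, 0, sizeof(ah_attr));
        ah_attr.is_global     = 0;             /* LID-routed, no GRH */
        ah_attr.dlid          = peer_lid;      /* LID published by the peer */
        ah_attr.sl            = sl;
        ah_attr.src_path_bits = src_path_bits;
        ah_attr.port_num      = port_num;
        return ibv_create_ah(pd, &ah_attr);
    }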

Lots of code was axed, as endpoints no longer have state.  Removed a
number of other elements in the endpoint struct to make it as lightweight
as possible.
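
Condensed, the per-peer data that survives looks roughly like the sketch below (the struct names here are made up; the real definitions are in the endpoint header diff further down):

    #include <infiniband/verbs.h>
    #include <stdint.h>

    struct ud_addr_sketch {             /* mirrors mca_btl_ud_addr_t */
        uint32_t qp_num_hp, qp_num_lp;  /* remote QP numbers (high/low priority) */
        uint32_t psn_hp, psn_lp;        /* packet sequence numbers */
        uint16_t lid, subnet;           /* LID and subnet */
    };

    struct ud_endpoint_sketch {         /* no state enum, lock, or pending-frag list */
        struct ud_addr_sketch rem_addr; /* peer address exchanged at startup */
        struct ibv_ah* rmt_ah_hp;       /* pre-created high-priority address handle */
        struct ibv_ah* rmt_ah_lp;       /* pre-created low-priority address handle */
    };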

I was able to remove an entire function call/branch in the send path,
which I believe is the main contributor to a 2us drop in NetPIPE latency.
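
A minimal sketch of what the shortened path assumes (not the actual BTL code; qkey handling and error paths are simplified): with the address handle and remote QP number already in hand, a send reduces to filling the UD fields of the work request and posting it:

    #include <infiniband/verbs.h>
    #include <stdint.h>

    /* Post one UD send.  Every endpoint already holds a valid address handle
     * and remote QP number, so there is no connection-state check or
     * queueing branch left in this path. */
    static int post_ud_send(struct ibv_qp* qp, struct ibv_send_wr* wr,
                            struct ibv_ah* ah, uint32_t remote_qpn,
                            uint32_t qkey)
    {
        struct ibv_send_wr* bad_wr;
        wr->wr.ud.ah          = ah;          /* created once at add_procs time */
        wr->wr.ud.remote_qpn  = remote_qpn;  /* from the modex exchange */
        wr->wr.ud.remote_qkey = qkey;
        return ibv_post_send(qp, wr, &bad_wr);  /* 0 on success */
    }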

Some whitespace cleanups as well.

Passes the IBM test suite, and all but certain Intel tests that were already
failing before the change, over the ob1 PML.

This commit was SVN r10494.
Andrew Friedley 2006-06-23 16:50:50 +00:00
parent f08e54029c
commit 7bfac82ce7
8 changed files with 155 additions and 478 deletions

View file

@@ -84,6 +84,7 @@ int mca_btl_ud_add_procs(
ompi_bitmap_t* reachable)
{
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
struct ibv_ah_attr ah_attr;
int i, rc;
for(i = 0; i < (int) nprocs; i++) {
@@ -114,8 +115,6 @@ int mca_btl_ud_add_procs(
return OMPI_ERR_OUT_OF_RESOURCE;
}
ib_peer->endpoint_btl = ud_btl;
ib_peer->subnet = ud_btl->port_info.subnet;
rc = mca_btl_ud_proc_insert(ib_proc, ib_peer);
if(rc != OMPI_SUCCESS) {
OBJ_RELEASE(ib_peer);
@@ -123,13 +122,43 @@ int mca_btl_ud_add_procs(
continue;
}
BTL_VERBOSE(("modex_recv HP QP num %d, LP QP num %d, LID = %d",
ib_peer->rem_addr.qp_num_hp,
ib_peer->rem_addr.qp_num_lp,
ib_peer->rem_addr.lid));
/* Set up IB address handles for the endpoint */
ah_attr.is_global = 0;
ah_attr.dlid = ib_peer->rem_addr.lid;
ah_attr.sl = mca_btl_ud_component.ib_service_level;
ah_attr.src_path_bits = mca_btl_ud_component.ib_src_path_bits;
ah_attr.port_num = ud_btl->port_num;
ib_peer->rmt_ah_hp = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
if(NULL == ib_peer->rmt_ah_hp) {
BTL_ERROR(("error creating address handle errno says %s\n",
strerror(errno)));
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
continue;
}
ib_peer->rmt_ah_lp = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
if(NULL == ib_peer) {
BTL_ERROR(("error creating address handle errno says %s\n",
strerror(errno)));
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
continue;
}
ompi_bitmap_set_bit(reachable, i);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
peers[i] = ib_peer;
}
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
if(mca_btl_ud_component.use_srq) {
ud_btl->rd_num = mca_btl_ud_component.rd_num + log2(nprocs) * mca_btl_ud_component.srq_rd_per_peer;
ud_btl->rd_num = mca_btl_ud_component.rd_num +
log2(nprocs) * mca_btl_ud_component.srq_rd_per_peer;
if(ud_btl->rd_num > mca_btl_ud_component.srq_rd_max)
ud_btl->rd_num = mca_btl_ud_component.srq_rd_max;
}
@@ -422,7 +451,6 @@ int mca_btl_ud_send(
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor,
mca_btl_base_tag_t tag)
{
int rc;
@@ -431,7 +459,12 @@ int mca_btl_ud_send(
MCA_BTL_UD_START_TIME(post_send);
frag->endpoint = endpoint;
frag->hdr->tag = tag;
rc = mca_btl_ud_endpoint_send(endpoint, frag);
/*OPAL_THREAD_LOCK(&endpoint->endpoint_lock);*/
rc = mca_btl_ud_endpoint_post_send(
(mca_btl_ud_module_t*)btl, endpoint, frag);
/*OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);*/
MCA_BTL_UD_END_TIME(post_send);
return rc;
}
@@ -454,8 +487,7 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
ud_btl->ib_pd = ibv_alloc_pd(ctx);
if(NULL == ud_btl->ib_pd) {
BTL_ERROR(("error allocating pd for %s errno says %s\n",
ibv_get_device_name(ud_btl->ib_dev),
strerror(errno)));
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
return OMPI_ERROR;
}
@@ -524,6 +556,10 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
return OMPI_ERROR;
}
/* Set up our packet sequence numbers */
ud_btl->addr.psn_hp = lrand48() & 0xffffff;
ud_btl->addr.psn_lp = lrand48() & 0xffffff;
/* Set up the QPs for this BTL */
if(OMPI_SUCCESS != mca_btl_ud_endpoint_init_qp(&ud_btl->super,
ud_btl->ib_cq_hp,
@@ -531,7 +567,7 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
ud_btl->srq_hp,
#endif
&ud_btl->qp_hp,
ud_btl->psn_hp)) {
ud_btl->addr.psn_hp)) {
return OMPI_ERROR;
}
@@ -541,10 +577,14 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
ud_btl->srq_lp,
#endif
&ud_btl->qp_lp,
ud_btl->psn_lp)) {
ud_btl->addr.psn_lp)) {
return OMPI_ERROR;
}
/* Place our QP numbers in our local address information */
ud_btl->addr.qp_num_hp = ud_btl->qp_hp->qp_num;
ud_btl->addr.qp_num_lp = ud_btl->qp_lp->qp_num;
OBJ_CONSTRUCT(&ud_btl->ib_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ud_btl->send_free_eager, ompi_free_list_t);
OBJ_CONSTRUCT(&ud_btl->send_free_max, ompi_free_list_t);
@@ -609,24 +649,24 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
/* Post receive descriptors */
for(i = 0; i < ud_btl->rd_num; i++) {
/* High Priority (eager) */
OMPI_FREE_LIST_WAIT(&ud_btl->recv_free_eager, item, rc);
frag = (mca_btl_ud_frag_t*)item;
frag->sg_entry.length = frag->size +
sizeof(mca_btl_ud_header_t) +
sizeof(mca_btl_ud_ib_header_t);
sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t);
if(ibv_post_recv(ud_btl->qp_hp,
&frag->wr_desc.rd_desc, &bad_wr)) {
BTL_ERROR(("error posting recv, errno %s\n", strerror(errno)));
return OMPI_ERROR;
}
/* Low Priority (max) */
OMPI_FREE_LIST_WAIT(&ud_btl->recv_free_max, item, rc);
frag = (mca_btl_ud_frag_t*)item;
frag->sg_entry.length = frag->size +
sizeof(mca_btl_ud_header_t) +
sizeof(mca_btl_ud_ib_header_t);
sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t);
if(ibv_post_recv(ud_btl->qp_lp,
&frag->wr_desc.rd_desc, &bad_wr)) {
BTL_ERROR(("error posting recv, errno %s\n", strerror(errno)));
@@ -637,6 +677,7 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
return OMPI_SUCCESS;
}
/*
* Dump profiling information
*/
@@ -645,7 +686,6 @@ void mca_btl_ud_dump(
struct mca_btl_base_endpoint_t* endpoint,
int verbose)
{
mca_btl_base_dump(btl, endpoint, verbose);
}

View file

@@ -103,7 +103,6 @@ struct mca_btl_ud_component_t {
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/
uint32_t ib_pkey_ix;
uint32_t ib_qkey;
uint32_t ib_psn;
uint32_t ib_service_level;
uint32_t ib_src_path_bits;
@@ -151,31 +150,40 @@ extern mca_btl_ud_profile_t mca_btl_ud_profile;
struct mca_btl_ud_module_t {
mca_btl_base_module_t super; /**< base PTL interface */
mca_btl_ud_recv_reg_t ib_reg[256];
mca_btl_ud_port_info_t port_info; /* contains only the subnet right now */
uint8_t port_num; /**< ID of the PORT */
struct ibv_device* ib_dev; /* the ib device */
struct ibv_context* ib_dev_context;
struct ibv_pd* ib_pd;
struct ibv_cq* ib_cq_hp;
struct ibv_cq* ib_cq_lp;
struct ibv_port_attr* ib_port_attr;
ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */
ompi_free_list_t send_free_max; /**< free list of max buffer descriptors */
ompi_free_list_t send_free_frag; /**< free list of frags only... used for pining memory */
struct mca_btl_ud_addr_t addr;
/**< local address information */
ompi_free_list_t recv_free_eager; /**< High priority free list of buffer descriptors */
ompi_free_list_t recv_free_max; /**< Low priority free list of buffer descriptors */
ompi_free_list_t send_free_eager;
/**< free list of eager buffer descriptors */
opal_list_t pending_frags_hp;
ompi_free_list_t send_free_max;
/**< free list of max buffer descriptors */
ompi_free_list_t send_free_frag;
/**< free list of frags only... used for pining memory */
ompi_free_list_t recv_free_eager;
/**< High priority free list of buffer descriptors */
ompi_free_list_t recv_free_max;
/**< Low priority free list of buffer descriptors */
opal_list_t pending_frags_hp;
/**< list of pending high priority frags */
opal_list_t pending_frags_lp;
opal_list_t pending_frags_lp;
/**< list of pending low priority frags */
opal_mutex_t ib_lock; /**< module level lock */
opal_mutex_t ib_lock; /**< module level lock */
size_t ib_inline_max; /**< max size of inline send*/
size_t ib_inline_max; /**< max size of inline send*/
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
struct ibv_srq *srq_hp;
@@ -194,10 +202,6 @@ struct mca_btl_ud_module_t {
int32_t sd_wqe_hp; /**< number of available send wqe entries */
int32_t sd_wqe_lp; /**< number of available send wqe entries */
uint32_t psn_hp;
uint32_t psn_lp;
/* Local processes port sequence number (Low and High) */
struct ibv_qp* qp_hp;
struct ibv_qp* qp_lp;
/* Local QP (Low and High) */
@@ -219,7 +223,7 @@ extern int mca_btl_ud_component_close(void);
/**
* IB component initialization.
*
* @param num_btl_modules (OUT) Number of BTLs returned in BTL array.
* @param num_btl_modules (OUT) Number of BTLs returned in BTL array.
* @param allow_multi_user_threads (OUT) Flag indicating wether BTL supports user threads (TRUE)
* @param have_hidden_threads (OUT) Flag indicating wether BTL uses threads (TRUE)
*

View file

@@ -163,8 +163,6 @@ int mca_btl_ud_component_open(void)
0, (int*) &mca_btl_ud_component.ib_pkey_ix);
mca_btl_ud_param_register_int("ib_qkey", "IB qkey",
0x01330133, (int*) &mca_btl_ud_component.ib_qkey);
mca_btl_ud_param_register_int("ib_psn", "IB Packet sequence starting number",
0, (int*) &mca_btl_ud_component.ib_psn);
mca_btl_ud_param_register_int("ib_service_level", "IB service level",
0, (int*) &mca_btl_ud_component.ib_service_level);
mca_btl_ud_param_register_int("ib_src_path_bits", "IB source path bits",
@@ -221,29 +219,35 @@ int mca_btl_ud_component_close(void)
* will make this available to all peers.
*/
/* TODO - We need to publish an addr_t (formerly rem_info_t) here */
static int
mca_btl_ud_modex_send(void)
{
int rc;
size_t i;
size_t size;
mca_btl_ud_port_info_t *ports = NULL;
mca_btl_ud_addr_t* addrs = NULL;
size = mca_btl_ud_component.ib_num_btls * sizeof (mca_btl_ud_port_info_t);
size = mca_btl_ud_component.ib_num_btls * sizeof(mca_btl_ud_addr_t);
if (size != 0) {
ports = (mca_btl_ud_port_info_t *)malloc (size);
if (NULL == ports) {
addrs = (mca_btl_ud_addr_t *)malloc(size);
if (NULL == addrs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < mca_btl_ud_component.ib_num_btls; i++) {
mca_btl_ud_module_t *btl = &mca_btl_ud_component.ud_btls[i];
ports[i] = btl->port_info;
mca_btl_ud_module_t* btl = &mca_btl_ud_component.ud_btls[i];
addrs[i] = btl->addr;
BTL_VERBOSE(("modex_send HP QP num %d, LP QP num %d, LID = %d",
addrs[i].qp_num_hp,
addrs[i].qp_num_lp,
addrs[i].lid));
}
}
rc = mca_pml_base_modex_send (&mca_btl_ud_component.super.btl_version, ports, size);
if (NULL != ports) {
free (ports);
rc = mca_pml_base_modex_send(&mca_btl_ud_component.super.btl_version, addrs, size);
if(NULL != addrs) {
free (addrs);
}
return rc;
}
@@ -346,34 +350,33 @@ mca_btl_base_module_t** mca_btl_ud_component_init(int *num_btl_modules,
/* Note ports are 1 based hence j = 1 */
for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++){
struct ibv_port_attr* ib_port_attr;
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++) {
struct ibv_port_attr ib_port_attr;
if(ibv_query_port(ib_dev_context, (uint8_t)j, &ib_port_attr)) {
BTL_ERROR(("error getting port attributes for device %s port number %d errno says %s",
ibv_get_device_name(ib_dev), j, strerror(errno)));
return NULL;
}
if( IBV_PORT_ACTIVE == ib_port_attr->state ){
ud_btl = (mca_btl_ud_module_t*) malloc(sizeof(mca_btl_ud_module_t));
memcpy(ud_btl, &mca_btl_ud_module, sizeof(mca_btl_ud_module));
if(IBV_PORT_ACTIVE == ib_port_attr.state) {
ud_btl =
(mca_btl_ud_module_t*)malloc(sizeof(mca_btl_ud_module_t));
memcpy(ud_btl, &mca_btl_ud_module, sizeof(mca_btl_ud_module_t));
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) ud_btl;
ib_selected->btl_module = (mca_btl_base_module_t*)ud_btl;
ud_btl->ib_dev = ib_dev;
ud_btl->ib_dev_context = ib_dev_context;
ud_btl->port_num = (uint8_t) j;
ud_btl->ib_port_attr = ib_port_attr;
ud_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */
ud_btl->port_num = (uint8_t)j;
ud_btl->addr.subnet = ib_port_attr.sm_lid;
ud_btl->addr.lid = ib_port_attr.lid;
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
if(++mca_btl_ud_component.ib_num_btls >= mca_btl_ud_component.ib_max_btls)
if(++mca_btl_ud_component.ib_num_btls >=
mca_btl_ud_component.ib_max_btls)
break;
}
else{
free(ib_port_attr);
}
}
}
@@ -397,8 +400,9 @@ mca_btl_base_module_t** mca_btl_ud_component_init(int *num_btl_modules,
for(i = 0; i < mca_btl_ud_component.ib_num_btls; i++){
item = opal_list_remove_first(&btl_list);
ib_selected = (mca_btl_base_selected_module_t*)item;
ud_btl = (mca_btl_ud_module_t*) ib_selected->btl_module;
memcpy(&(mca_btl_ud_component.ud_btls[i]), ud_btl, sizeof(mca_btl_ud_module_t));
ud_btl = (mca_btl_ud_module_t*)ib_selected->btl_module;
memcpy(&(mca_btl_ud_component.ud_btls[i]),
ud_btl, sizeof(mca_btl_ud_module_t));
free(ib_selected);
free(ud_btl);
@@ -421,8 +425,6 @@ mca_btl_base_module_t** mca_btl_ud_component_init(int *num_btl_modules,
btls[i] = &ud_btl->super;
}
/* Post OOB receive to support dynamic connection setup */
mca_btl_ud_post_recv();
mca_btl_ud_modex_send();
*num_btl_modules = mca_btl_ud_component.ib_num_btls;

View file

@@ -69,7 +69,7 @@ inline int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
ib_qp = ud_btl->qp_hp;
frag->wr_desc.sr_desc.wr.ud.ah = endpoint->rmt_ah_hp;
frag->wr_desc.sr_desc.wr.ud.remote_qpn =
endpoint->rem_info.rem_qp_num_hp;
endpoint->rem_addr.qp_num_hp;
if(frag->sg_entry.length <= ud_btl->ib_inline_max) {
frag->wr_desc.sr_desc.send_flags =
@@ -86,20 +86,21 @@ inline int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
ib_qp = ud_btl->qp_lp;
frag->wr_desc.sr_desc.wr.ud.ah = endpoint->rmt_ah_lp;
frag->wr_desc.sr_desc.wr.ud.remote_qpn =
endpoint->rem_info.rem_qp_num_lp;
endpoint->rem_addr.qp_num_lp;
}
/*BTL_VERBOSE(("Send to : %d, len : %d %d %d, frag : %p",
endpoint->endpoint_proc->proc_guid.vpid,
/*OPAL_OUTPUT((0, "Send to LID %d QP %d, len: %d %d %d, frag: %p",
endpoint->rem_addr.lid,
frag->wr_desc.sr_desc.wr.ud.remote_qpn,
frag->sg_entry.length, frag->segment.seg_len,
ud_btl->ib_inline_max, frag)); */
ud_btl->ib_inline_max, frag));*/
#if MCA_BTL_UD_ENABLE_PROFILE
frag->tm = opal_sys_timer_get_cycles();
#endif
MCA_BTL_UD_START_TIME(ibv_post_send);
if(OPAL_UNLIKELY(ibv_post_send(ib_qp, &frag->wr_desc.sr_desc, &bad_wr) != 0)) {
if(OPAL_UNLIKELY(ibv_post_send(ib_qp, &frag->wr_desc.sr_desc, &bad_wr))) {
BTL_ERROR(("error posting send request errno says %d %s\n",
errno, strerror(errno)));
return OMPI_ERROR;
@@ -122,15 +123,12 @@ OBJ_CLASS_INSTANCE(mca_btl_ud_endpoint_t,
static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
endpoint->endpoint_btl = 0;
endpoint->endpoint_proc = 0;
endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
/*OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);*/
memset(&endpoint->rem_info, 0, sizeof(struct mca_btl_ud_rem_info_t));
memset(&endpoint->rem_addr, 0, sizeof(struct mca_btl_ud_addr_t));
}
/*
* Destroy a endpoint
*
@@ -140,325 +138,6 @@ static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
{
}
/*
* Send connection information to remote endpoint using OOB
*
*/
static void mca_btl_ud_endpoint_send_cb(int status, orte_process_name_t* endpoint,
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata)
{
OBJ_RELEASE(buffer);
}
static int mca_btl_ud_endpoint_send_connect_data(mca_btl_ud_endpoint_t* endpoint)
{
mca_btl_ud_module_t* ud_btl = endpoint->endpoint_btl;
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
int rc;
if(NULL == buffer) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* pack the info in the send buffer */
rc = orte_dss.pack(buffer, &ud_btl->qp_hp->qp_num, 1, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.pack(buffer, &ud_btl->qp_lp->qp_num, 1, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.pack(buffer, &ud_btl->psn_hp, 1, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.pack(buffer, &ud_btl->psn_lp, 1, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.pack(buffer, &ud_btl->ib_port_attr->lid, 1, ORTE_UINT16);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.pack(buffer,
&((mca_btl_ud_endpoint_t*)endpoint)->subnet, 1, ORTE_UINT16);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* send to endpoint */
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid,
buffer, ORTE_RML_TAG_DYNAMIC-1, 0, mca_btl_ud_endpoint_send_cb, NULL);
BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
ud_btl->qp_hp->qp_num,
ud_btl->qp_lp->qp_num,
endpoint->endpoint_btl->ib_port_attr->lid));
if(rc < 0) {
ORTE_ERROR_LOG(rc);
return rc;
}
return OMPI_SUCCESS;
}
/*
* Non blocking OOB recv callback.
* Read incoming QP and other info, and if this endpoint
* is trying to connect, reply with our QP info,
* otherwise try to modify QP's and establish
* reliable connection
*
*/
static void mca_btl_ud_endpoint_recv(
int status,
orte_process_name_t* endpoint,
orte_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata)
{
struct ibv_ah_attr ah_attr;
mca_btl_ud_proc_t *ib_proc;
mca_btl_ud_endpoint_t *ib_endpoint = NULL;
mca_btl_ud_rem_info_t rem_info;
mca_btl_ud_module_t* ud_btl;
int rc;
uint32_t i;
size_t cnt = 1;
/* start by unpacking data first so we know who is knocking at
our door */
rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_hp, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_lp, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dss.unpack(buffer, &rem_info.rem_psn_hp, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dss.unpack(buffer, &rem_info.rem_psn_lp, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dss.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dss.unpack(buffer, &rem_info.rem_subnet, &cnt, ORTE_UINT16);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
/*BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
rem_info.rem_qp_num_hp,
rem_info.rem_qp_num_lp,
rem_info.rem_lid));*/
for(ib_proc = (mca_btl_ud_proc_t*)
opal_list_get_first(&mca_btl_ud_component.ib_procs);
ib_proc != (mca_btl_ud_proc_t*)
opal_list_get_end(&mca_btl_ud_component.ib_procs);
ib_proc = (mca_btl_ud_proc_t*)opal_list_get_next(ib_proc)) {
if(orte_ns.compare(ORTE_NS_CMP_ALL,
&ib_proc->proc_guid, endpoint) == 0) {
bool found = false;
/* Try to get the endpoint instance of this proc */
for(i = 0; i < ib_proc->proc_endpoint_count; i++) {
ib_endpoint = ib_proc->proc_endpoints[i];
if(ib_endpoint->rem_info.rem_lid &&
ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) {
/* we've seen them before! */
found = true;
break;
}
}
/* If we haven't seen this remote lid before then try to match on
endpoint */
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
ib_endpoint = ib_proc->proc_endpoints[i];
if(!ib_endpoint->rem_info.rem_lid &&
ib_endpoint->subnet == rem_info.rem_subnet) {
/* found a match based on subnet! */
found = true;
break;
}
}
/* try finding an open port, even if subnets don't match */
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
ib_endpoint = ib_proc->proc_endpoints[i];
if(!ib_endpoint->rem_info.rem_lid) {
/* found an unused end-point */
found = true;
break;
}
}
if(!found) {
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
return;
}
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
/* Update status */
if(ib_endpoint->endpoint_state == MCA_BTL_IB_CLOSED) {
if(OMPI_SUCCESS !=
mca_btl_ud_endpoint_send_connect_data(ib_endpoint)) {
BTL_ERROR(("error sending connect request, error code %d", rc));
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
}
/* Always 'CONNECTED' at this point */
ud_btl = ib_endpoint->endpoint_btl;
memcpy(&ib_endpoint->rem_info,
&rem_info, sizeof(mca_btl_ud_rem_info_t));
ah_attr.is_global = 0;
ah_attr.dlid = rem_info.rem_lid;
ah_attr.sl = mca_btl_ud_component.ib_service_level;
ah_attr.src_path_bits = mca_btl_ud_component.ib_src_path_bits;
ah_attr.port_num = ud_btl->port_num;
ib_endpoint->rmt_ah_hp = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
if(NULL == ib_endpoint->rmt_ah_hp) {
BTL_ERROR(("error creating address handle errno says %s\n",
strerror(errno)));
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
ib_endpoint->rmt_ah_lp = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
if(NULL == ib_endpoint) {
BTL_ERROR(("error creating address handle errno says %s\n",
strerror(errno)));
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
return;
}
ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
/*BTL_VERBOSE(("connected! QP num = %d, Low Priority QP num %d, LID = %d",
ib_endpoint->rem_info.rem_qp_num_hp,
ib_endpoint->rem_info.rem_qp_num_lp,
ib_endpoint->rem_info.rem_lid));*/
/* Post our queued sends */
while(!opal_list_is_empty(&(ib_endpoint->pending_send_frags))) {
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)
opal_list_remove_first(&(ib_endpoint->pending_send_frags));
if(OMPI_SUCCESS != mca_btl_ud_endpoint_post_send(
ud_btl, ib_endpoint, frag)) {
BTL_ERROR(("ERROR posting send"));
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
break;
}
}
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
break;
}
}
}
/*
* Post the OOB recv (for receiving the peers information)
*/
void mca_btl_ud_post_recv()
{
orte_rml.recv_buffer_nb(
ORTE_RML_NAME_ANY,
ORTE_RML_TAG_DYNAMIC-1,
ORTE_RML_PERSISTENT,
mca_btl_ud_endpoint_recv,
NULL);
}
/*
* Attempt to send a fragment using a given endpoint. If the endpoint is not
* connected, queue the fragment and start the connection as required.
*/
int mca_btl_ud_endpoint_send(mca_btl_base_endpoint_t* endpoint,
mca_btl_ud_frag_t* frag)
{
int rc = OMPI_SUCCESS;
bool call_progress = false;
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
if(OPAL_LIKELY(endpoint->endpoint_state == MCA_BTL_IB_CONNECTED)) {
MCA_BTL_UD_START_TIME(endpoint_send_conn);
rc = mca_btl_ud_endpoint_post_send(
endpoint->endpoint_btl, endpoint, frag);
MCA_BTL_UD_END_TIME(endpoint_send_conn);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return rc;
}
switch(endpoint->endpoint_state) {
case MCA_BTL_IB_CLOSED:
/* Send connection info over to remote endpoint */
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
rc = mca_btl_ud_endpoint_send_connect_data(endpoint);
if(OMPI_SUCCESS != rc) {
BTL_ERROR(("error sending connect request, error code %d", rc));
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
return rc;
}
/**
* As long as we expect a message from the peer (in order to setup
* the connection) let the event engine pool the OOB events. Note:
* we increment it once per active connection.
*/
opal_progress_event_increment();
call_progress = true;
/* No break here - fall through */
case MCA_BTL_IB_CONNECTING:
opal_list_append(&endpoint->pending_send_frags,
(opal_list_item_t *)frag);
break;
case MCA_BTL_IB_FAILED:
BTL_ERROR(("endpoint FAILED"));
default:
rc = OMPI_ERR_UNREACH;
}
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(call_progress) opal_progress();
return rc;
}
/*
* Create the queue pair note that this is just the initial

View file

@@ -41,96 +41,50 @@ OBJ_CLASS_DECLARATION(mca_btl_ud_endpoint_t);
struct mca_btl_ud_frag_t;
struct mca_btl_ud_port_info_t {
uint16_t subnet;
struct mca_btl_ud_addr_t {
uint32_t qp_num_hp;
uint32_t qp_num_lp;
/* QP number (Low and High priority) */
uint32_t psn_hp;
uint32_t psn_lp;
/* Port sequence number (Low and High) */
uint16_t lid;
uint16_t subnet;
/* Local Identifier & Subnet */
};
typedef struct mca_btl_ud_port_info_t mca_btl_ud_port_info_t;
/**
* State of IB endpoint connection.
*/
typedef enum {
/* Defines the state in which this BTL instance
* has started the process of connection */
MCA_BTL_IB_CONNECTING,
/* Connected ... both sender & receiver have
* buffers associated with this connection */
MCA_BTL_IB_CONNECTED,
/* Connection is closed, there are no resources
* associated with this */
MCA_BTL_IB_CLOSED,
/* Maximum number of retries have been used.
* Report failure on send to upper layer */
MCA_BTL_IB_FAILED
} mca_btl_ud_endpoint_state_t;
struct mca_btl_ud_rem_info_t {
uint32_t rem_qp_num_hp;
uint32_t rem_qp_num_lp;
/* Remote QP number (Low and High priority) */
uint16_t rem_lid;
/* Local identifier of the remote process */
uint32_t rem_psn_hp;
uint32_t rem_psn_lp;
/* Remote processes port sequence number (Low and High) */
uint16_t rem_subnet;
/* subnet of remote process */
};
typedef struct mca_btl_ud_rem_info_t mca_btl_ud_rem_info_t;
typedef struct mca_btl_ud_addr_t mca_btl_ud_addr_t;
/**
* An abstraction that represents a connection to a endpoint process.
* An instance of mca_btl_base_endpoint_t is associated w/ each process
* and BTL pair at startup. Normally connections are established as-needed.
* and BTL pair and address information is exchanged at startup.
* The UD BTL is connectionless, so no connection is ever established.
*/
struct mca_btl_base_endpoint_t {
opal_list_item_t super;
struct mca_btl_ud_module_t* endpoint_btl;
/**< BTL instance that created this connection */
struct mca_btl_ud_proc_t* endpoint_proc;
/**< proc structure corresponding to endpoint */
mca_btl_ud_endpoint_state_t endpoint_state;
/**< current state of the connection */
opal_mutex_t endpoint_lock;
/*opal_mutex_t endpoint_lock;*/
/**< lock for concurrent access to endpoint state */
opal_list_t pending_send_frags;
/**< list of pending send frags for this endpoint */
mca_btl_ud_rem_info_t rem_info;
mca_btl_ud_addr_t rem_addr;
/**< Remote address information */
struct ibv_ah* rmt_ah_hp;
struct ibv_ah* rmt_ah_lp;
/* Remote Address Handle (Low and High) */
uint16_t subnet; /**< subnet of this endpoint*/
/**< Remote Address Handle (Low and High) */
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_ud_endpoint_t;
int mca_btl_ud_endpoint_send(mca_btl_base_endpoint_t* endpoint, struct mca_btl_ud_frag_t* frag);
inline int mca_btl_ud_endpoint_post_send(struct mca_btl_ud_module_t* ud_btl,
mca_btl_ud_endpoint_t * endpoint,
struct mca_btl_ud_frag_t * frag);
int mca_btl_ud_endpoint_connect(mca_btl_base_endpoint_t*);
void mca_btl_ud_post_recv(void);
int mca_btl_ud_endpoint_init_qp(
mca_btl_base_module_t* btl,
struct ibv_cq* cq,

View file

@@ -18,6 +18,7 @@
* $HEADER$
*/
#include "btl_ud.h"
#include "btl_ud_frag.h"
#include "ompi/mca/mpool/openib/mpool_openib.h"

View file

@@ -36,7 +36,7 @@ OBJ_CLASS_INSTANCE(mca_btl_ud_proc_t,
void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc)
{
proc->proc_ompi = 0;
proc->proc_port_count = 0;
proc->proc_addr_count = 0;
proc->proc_endpoints = 0;
proc->proc_endpoint_count = 0;
OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
@@ -111,27 +111,20 @@ mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
return module_proc;
}
/* Oops! First time, gotta create a new IB proc
* out of the ompi_proc ... */
/* Oops! First time, gotta create a new IB proc out of the ompi_proc ... */
module_proc = OBJ_NEW(mca_btl_ud_proc_t);
/* Initialize number of peer */
module_proc->proc_endpoint_count = 0;
module_proc->proc_ompi = ompi_proc;
/* build a unique identifier (of arbitrary
* size) to represent the proc */
/* build a unique identifier (of arbitrary size) to represent the proc */
module_proc->proc_guid = ompi_proc->proc_name;
/* query for the peer address info */
rc = mca_pml_base_modex_recv(
&mca_btl_ud_component.super.btl_version,
ompi_proc,
(void*)&module_proc->proc_ports,
&size
);
rc = mca_pml_base_modex_recv(&mca_btl_ud_component.super.btl_version,
ompi_proc, (void*)&module_proc->proc_addrs,
&size);
if(OMPI_SUCCESS != rc) {
opal_output(0, "[%s:%d] mca_pml_base_modex_recv failed for peer [%d,%d,%d]",
@@ -140,7 +133,7 @@ mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
return NULL;
}
if((size % sizeof(mca_btl_ud_port_info_t)) != 0) {
if((size % sizeof(mca_btl_ud_addr_t)) != 0) {
opal_output(0, "[%s:%d] invalid module address for peer [%d,%d,%d]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
@@ -148,14 +141,15 @@ mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
}
module_proc->proc_port_count = size/sizeof(mca_btl_ud_port_info_t);
module_proc->proc_addr_count = size / sizeof(mca_btl_ud_addr_t);
if (0 == module_proc->proc_port_count) {
if (0 == module_proc->proc_addr_count) {
module_proc->proc_endpoints = NULL;
} else {
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
malloc(module_proc->proc_addr_count *
sizeof(mca_btl_base_endpoint_t*));
}
if(NULL == module_proc->proc_endpoints) {
@@ -175,7 +169,10 @@ int mca_btl_ud_proc_insert(mca_btl_ud_proc_t* module_proc,
mca_btl_base_endpoint_t* module_endpoint)
{
/* insert into endpoint array */
module_endpoint->endpoint_proc = module_proc;
module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint;
module_endpoint->rem_addr =
module_proc->proc_addrs[module_proc->proc_endpoint_count];
module_proc->proc_endpoints[module_proc->proc_endpoint_count++] =
module_endpoint;
return OMPI_SUCCESS;
}

View file

@@ -48,9 +48,9 @@ struct mca_btl_ud_proc_t {
orte_process_name_t proc_guid;
/**< globally unique identifier for the process */
struct mca_btl_ud_port_info_t* proc_ports;
size_t proc_port_count;
/**< number of ports published by endpoint */
struct mca_btl_ud_addr_t* proc_addrs;
size_t proc_addr_count;
/**< number of addresses published by endpoint */
struct mca_btl_base_endpoint_t **proc_endpoints;
/**< array of endpoints that have been created to access this proc */