Add multi-nic support to openib
Fix connection establishment race in openib Other misc This commit was SVN r7570.
Этот коммит содержится в:
родитель
0459678f82
Коммит
67d38b7896
@ -563,7 +563,7 @@ static void mca_btl_mvapi_endpoint_recv(
|
||||
port_info = ib_proc->proc_ports[i];
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(!ib_endpoint->rem_info.rem_lid &&
|
||||
ib_endpoint->rem_info.rem_subnet == rem_info.rem_subnet) {
|
||||
ib_endpoint->subnet == rem_info.rem_subnet) {
|
||||
/* found a match based on subnet! */
|
||||
found = true;
|
||||
break;
|
||||
|
@ -106,6 +106,7 @@ int mca_btl_openib_add_procs(
|
||||
}
|
||||
|
||||
ib_peer->endpoint_btl = openib_btl;
|
||||
ib_peer->subnet = openib_btl->port_info.subnet;
|
||||
rc = mca_btl_openib_proc_insert(ib_proc, ib_peer);
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OBJ_RELEASE(ib_peer);
|
||||
@ -360,7 +361,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (unsigned char*) frag->segment.seg_addr.lval + reserve;
|
||||
iov.iov_base = (unsigned char*) (frag->segment.seg_addr.lval + reserve);
|
||||
|
||||
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
|
||||
*size = max_data;
|
||||
|
@ -36,6 +36,7 @@
|
||||
|
||||
#include "mca/btl/btl.h"
|
||||
#include "mca/btl/base/base.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
@ -126,7 +127,7 @@ struct mca_btl_openib_module_t {
|
||||
mca_btl_base_module_t super; /**< base PTL interface */
|
||||
bool btl_inited;
|
||||
mca_btl_openib_recv_reg_t ib_reg[256];
|
||||
|
||||
mca_btl_openib_port_info_t port_info; /* contains only the subnet right now */
|
||||
uint8_t port_num; /**< ID of the PORT */
|
||||
struct ibv_device *ib_dev; /* the ib device */
|
||||
struct ibv_context *ib_dev_context;
|
||||
|
@ -41,6 +41,8 @@
|
||||
#include <errno.h>
|
||||
#include <string.h> /* for strerror()*/
|
||||
|
||||
#include "mca/pml/base/pml_base_module_exchange.h"
|
||||
|
||||
mca_btl_openib_component_t mca_btl_openib_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
@ -225,6 +227,40 @@ int mca_btl_openib_component_close(void)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register MVAPI port information. The MCA framework
|
||||
* will make this available to all peers.
|
||||
*/
|
||||
|
||||
static int
|
||||
mca_btl_openib_modex_send(void)
|
||||
{
|
||||
int rc;
|
||||
size_t i;
|
||||
size_t size;
|
||||
mca_btl_openib_port_info_t *ports = NULL;
|
||||
|
||||
size = mca_btl_openib_component.ib_num_btls * sizeof (mca_btl_openib_port_info_t);
|
||||
if (size != 0) {
|
||||
ports = (mca_btl_openib_port_info_t *)malloc (size);
|
||||
if (NULL == ports) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
mca_btl_openib_module_t *btl = &mca_btl_openib_component.openib_btls[i];
|
||||
ports[i] = btl->port_info;
|
||||
}
|
||||
}
|
||||
rc = mca_pml_base_modex_send (&mca_btl_openib_component.super.btl_version, ports, size);
|
||||
if (NULL != ports) {
|
||||
free (ports);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* IB component initialization:
|
||||
* (1) read interface list from kernel and compare against component parameters
|
||||
@ -260,6 +296,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
|
||||
dev_list = ibv_get_devices();
|
||||
if (NULL == dev_list) {
|
||||
mca_btl_base_error_no_nics("OpenIB", "HCA");
|
||||
mca_btl_openib_component.ib_num_btls = 0;
|
||||
mca_btl_openib_modex_send();
|
||||
return NULL;
|
||||
}
|
||||
dlist_start(dev_list);
|
||||
@ -269,6 +307,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
|
||||
|
||||
if(0 == num_devs) {
|
||||
mca_btl_base_error_no_nics("OpenIB", "HCA");
|
||||
mca_btl_openib_modex_send();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -335,7 +374,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
|
||||
openib_btl->ib_dev_context = ib_dev_context;
|
||||
openib_btl->port_num = (uint8_t) j;
|
||||
openib_btl->ib_port_attr = ib_port_attr;
|
||||
|
||||
openib_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */
|
||||
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
|
||||
mca_btl_openib_component.ib_num_btls ++;
|
||||
|
||||
@ -482,6 +521,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
|
||||
|
||||
/* Post OOB receive to support dynamic connection setup */
|
||||
mca_btl_openib_post_recv();
|
||||
mca_btl_openib_modex_send();
|
||||
|
||||
*num_btl_modules = mca_btl_openib_component.ib_num_btls;
|
||||
free(ib_devs);
|
||||
@ -533,7 +573,7 @@ int mca_btl_openib_component_progress()
|
||||
/* Process a RECV */
|
||||
|
||||
BTL_VERBOSE(("Got an recv on the completion queue"));
|
||||
frag = (mca_btl_openib_frag_t*) wc.wr_id;
|
||||
frag = (mca_btl_openib_frag_t*) (void*) wc.wr_id;
|
||||
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
|
||||
frag->rc=OMPI_SUCCESS;
|
||||
frag->segment.seg_len =
|
||||
|
@ -128,8 +128,12 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
||||
memset(endpoint->lcl_qp_attr_low, 0, sizeof(struct ibv_qp_attr));
|
||||
endpoint->rr_posted_high = 0;
|
||||
endpoint->rr_posted_low = 0;
|
||||
|
||||
|
||||
endpoint->rem_info.rem_qp_num_high = 0;
|
||||
endpoint->rem_info.rem_qp_num_low = 0;
|
||||
endpoint->rem_info.rem_lid = 0;
|
||||
endpoint->rem_info.rem_psn_high = 0;
|
||||
endpoint->rem_info.rem_psn_low = 0;
|
||||
endpoint->rem_info.rem_subnet = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -157,7 +161,7 @@ static void mca_btl_openib_endpoint_send_cb(
|
||||
}
|
||||
|
||||
|
||||
static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* endpoint)
|
||||
static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
|
||||
int rc;
|
||||
@ -191,12 +195,19 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
rc = orte_dps.pack(buffer, &endpoint->endpoint_btl->ib_port_attr->lid, 1, ORTE_UINT16);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
rc = orte_dps.pack(buffer, &((mca_btl_openib_endpoint_t*) endpoint)->subnet, 1, ORTE_UINT16);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -217,91 +228,23 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Send connect ACK to remote endpoint
|
||||
*
|
||||
*/
|
||||
|
||||
static int mca_btl_openib_endpoint_send_connect_ack(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
|
||||
int rc;
|
||||
uint32_t zero = 0;
|
||||
|
||||
/* pack the info in the send buffer */
|
||||
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT16))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send to endpoint */
|
||||
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
|
||||
mca_btl_openib_endpoint_send_cb, NULL);
|
||||
if(rc < 0) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set remote connection info
|
||||
* (from OOB connection)
|
||||
*
|
||||
*/
|
||||
static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, orte_buffer_t* buffer)
|
||||
static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, mca_btl_openib_rem_info_t* rem_info)
|
||||
{
|
||||
int rc;
|
||||
|
||||
|
||||
|
||||
size_t cnt = 1;
|
||||
rc = orte_dps.unpack(buffer, &endpoint->rem_qp_num_high, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &endpoint->rem_qp_num_low, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &endpoint->rem_psn_high, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &endpoint->rem_psn_low, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &endpoint->rem_lid, &cnt, ORTE_UINT16);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
|
||||
endpoint->rem_qp_num_high,
|
||||
endpoint->rem_qp_num_low,
|
||||
endpoint->rem_lid));
|
||||
|
||||
memcpy(&((mca_btl_openib_endpoint_t*) endpoint)->rem_info, rem_info, sizeof(mca_btl_openib_rem_info_t));
|
||||
|
||||
BTL_VERBOSE(("Setting High Priority QP num = %d, Low Priority QP num %d, LID = %d",
|
||||
endpoint->rem_info.rem_qp_num_high,
|
||||
endpoint->rem_info.rem_qp_num_low,
|
||||
endpoint->rem_info.rem_lid));
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -351,7 +294,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
|
||||
|
||||
/* Send connection info over to remote endpoint */
|
||||
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_req(endpoint))) {
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
|
||||
BTL_ERROR(("error sending connect request, error code %d", rc));
|
||||
return rc;
|
||||
}
|
||||
@ -362,7 +305,8 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
|
||||
* Reply to a `start - connect' message
|
||||
*
|
||||
*/
|
||||
static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint, orte_buffer_t* buffer)
|
||||
static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint,
|
||||
mca_btl_openib_rem_info_t *rem_info)
|
||||
{
|
||||
int rc;
|
||||
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl;
|
||||
@ -398,7 +342,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
|
||||
|
||||
|
||||
/* Set the remote side info */
|
||||
mca_btl_openib_endpoint_set_remote_info(endpoint, buffer);
|
||||
mca_btl_openib_endpoint_set_remote_info(endpoint, rem_info);
|
||||
|
||||
/* Connect to endpoint */
|
||||
|
||||
@ -409,13 +353,22 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
|
||||
}
|
||||
|
||||
/* Send connection info over to remote endpoint */
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_req(endpoint))) {
|
||||
endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
|
||||
BTL_ERROR(("error in endpoint send connect request error code is %d", rc));
|
||||
return rc;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* endpoint is waiting ack to final connection establishment..
|
||||
*/
|
||||
|
||||
static void mca_btl_openib_endpoint_waiting_ack(mca_btl_openib_endpoint_t *endpoint) {
|
||||
endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK;
|
||||
}
|
||||
|
||||
/*
|
||||
* called when the openib has completed setup via the
|
||||
* OOB channel
|
||||
@ -423,9 +376,25 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
|
||||
|
||||
static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
{
|
||||
opal_list_item_t *frag_item;
|
||||
mca_btl_openib_frag_t *frag;
|
||||
mca_btl_openib_module_t* openib_btl;
|
||||
|
||||
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
|
||||
endpoint->endpoint_btl->poll_cq = true;
|
||||
mca_btl_openib_progress_send_frags(endpoint);
|
||||
|
||||
/* While there are frags in the list,
|
||||
* process them */
|
||||
|
||||
while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
|
||||
frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
|
||||
frag = (mca_btl_openib_frag_t *) frag_item;
|
||||
openib_btl = endpoint->endpoint_btl;
|
||||
/* We need to post this one */
|
||||
|
||||
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
|
||||
BTL_ERROR(("Error posting send"));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -448,6 +417,78 @@ static void mca_btl_openib_endpoint_recv(
|
||||
mca_btl_openib_endpoint_t *ib_endpoint;
|
||||
int endpoint_state;
|
||||
int rc;
|
||||
uint32_t i;
|
||||
size_t cnt = 1;
|
||||
mca_btl_openib_rem_info_t rem_info;
|
||||
|
||||
ib_endpoint = (mca_btl_openib_endpoint_t*) endpoint;
|
||||
/* start by unpacking data first so we know who is knocking at
|
||||
our door */
|
||||
|
||||
rc = orte_dps.unpack(buffer, &rem_info.rem_qp_num_high, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &rem_info.rem_qp_num_low, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &rem_info.rem_psn_high, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}rc = orte_dps.unpack(buffer, &rem_info.rem_psn_low, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dps.unpack(buffer, &rem_info.rem_subnet, &cnt, ORTE_UINT16);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
#if 0
|
||||
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->r_key, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->rem_base, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->rem_size, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->rem_cnt, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
|
||||
rem_info.rem_qp_num_high,
|
||||
rem_info.rem_qp_num_low,
|
||||
rem_info.rem_lid));
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
for(ib_proc = (mca_btl_openib_proc_t*)
|
||||
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
||||
@ -456,14 +497,54 @@ static void mca_btl_openib_endpoint_recv(
|
||||
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
|
||||
|
||||
if(ib_proc->proc_guid.vpid == endpoint->vpid) {
|
||||
|
||||
bool found = false;
|
||||
|
||||
/* Try to get the endpoint instance of this proc */
|
||||
|
||||
/* Limitation: Right now, we have only 1 endpoint
|
||||
* for every process. Need several changes, some
|
||||
* in PML/BTL interface to set this right */
|
||||
for(i = 0; i < ib_proc->proc_endpoint_count; i++) {
|
||||
mca_btl_openib_port_info_t port_info;
|
||||
port_info = ib_proc->proc_ports[i];
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(ib_endpoint->rem_info.rem_lid &&
|
||||
ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) {
|
||||
/* we've seen them before! */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* If we haven't seen this remote lid before then try to match on
|
||||
endpoint */
|
||||
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
|
||||
mca_btl_openib_port_info_t port_info;
|
||||
port_info = ib_proc->proc_ports[i];
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(!ib_endpoint->rem_info.rem_lid &&
|
||||
ib_endpoint->subnet == rem_info.rem_subnet) {
|
||||
/* found a match based on subnet! */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* try finding an open port, even if subnets
|
||||
don't match
|
||||
*/
|
||||
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
|
||||
mca_btl_openib_port_info_t port_info;
|
||||
port_info = ib_proc->proc_ports[i];
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(!ib_endpoint->rem_info.rem_lid) {
|
||||
/* found an unused end-point */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(!found) {
|
||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||
return;
|
||||
}
|
||||
ib_endpoint = ib_proc->proc_endpoints[0];
|
||||
|
||||
|
||||
endpoint_state = ib_endpoint->endpoint_state;
|
||||
|
||||
/* Update status */
|
||||
@ -474,37 +555,38 @@ static void mca_btl_openib_endpoint_recv(
|
||||
* status of this connection to CONNECTING,
|
||||
* and then reply with our QP information */
|
||||
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, buffer))) {
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, &rem_info))) {
|
||||
BTL_ERROR(("error in endpoint reply start connect"));
|
||||
break;
|
||||
}
|
||||
|
||||
/* Setup state as connected */
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
|
||||
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECTING :
|
||||
|
||||
mca_btl_openib_endpoint_set_remote_info(ib_endpoint, buffer);
|
||||
mca_btl_openib_endpoint_set_remote_info(ib_endpoint, &rem_info);
|
||||
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) {
|
||||
BTL_ERROR(("endpoint connect error: %d", rc));
|
||||
break;
|
||||
}
|
||||
|
||||
/* Setup state as connected */
|
||||
mca_btl_openib_endpoint_connected(ib_endpoint);
|
||||
/* Setup state as awaiting ack from peer */
|
||||
mca_btl_openib_endpoint_waiting_ack(ib_endpoint);
|
||||
|
||||
/* Send him an ack */
|
||||
mca_btl_openib_endpoint_send_connect_ack(ib_endpoint);
|
||||
mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
|
||||
break;
|
||||
|
||||
|
||||
case MCA_BTL_IB_WAITING_ACK:
|
||||
mca_btl_openib_endpoint_connected(ib_endpoint);
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECT_ACK:
|
||||
|
||||
mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
|
||||
mca_btl_openib_endpoint_connected(ib_endpoint);
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECTED :
|
||||
|
||||
break;
|
||||
default :
|
||||
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
|
||||
@ -561,7 +643,7 @@ int mca_btl_openib_endpoint_send(
|
||||
break;
|
||||
|
||||
case MCA_BTL_IB_CONNECT_ACK:
|
||||
|
||||
case MCA_BTL_IB_WAITING_ACK:
|
||||
BTL_VERBOSE(("Queuing because waiting for ack"));
|
||||
|
||||
opal_list_append(&endpoint->pending_send_frags,
|
||||
@ -586,10 +668,10 @@ int mca_btl_openib_endpoint_send(
|
||||
case MCA_BTL_IB_CONNECTED:
|
||||
{
|
||||
openib_btl = endpoint->endpoint_btl;
|
||||
BTL_VERBOSE(("Send to : %d, len : %lu, frag : %llu",
|
||||
BTL_VERBOSE(("Send to : %d, len : %lu, frag : %p",
|
||||
endpoint->endpoint_proc->proc_guid.vpid,
|
||||
frag->sg_entry.length,
|
||||
(unsigned long long) frag));
|
||||
frag));
|
||||
rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag);
|
||||
break;
|
||||
}
|
||||
@ -602,36 +684,6 @@ int mca_btl_openib_endpoint_send(
|
||||
|
||||
return rc;
|
||||
}
|
||||
/*
|
||||
* Progress send frags, any frags that have been queued up before the connection
|
||||
* was established need to be progressed.
|
||||
*/
|
||||
void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t* endpoint)
|
||||
{
|
||||
opal_list_item_t *frag_item;
|
||||
mca_btl_openib_frag_t *frag;
|
||||
mca_btl_openib_module_t* openib_btl;
|
||||
/*Check if endpoint is connected */
|
||||
if(endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* While there are frags in the list,
|
||||
* process them */
|
||||
|
||||
while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
|
||||
frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
|
||||
frag = (mca_btl_openib_frag_t *) frag_item;
|
||||
openib_btl = endpoint->endpoint_btl;
|
||||
/* We need to post this one */
|
||||
|
||||
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
|
||||
BTL_ERROR(("Error posting send"));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Complete connection to endpoint.
|
||||
@ -649,9 +701,9 @@ int mca_btl_openib_endpoint_connect(
|
||||
endpoint->lcl_qp_high,
|
||||
endpoint->lcl_qp_attr_high,
|
||||
endpoint->lcl_psn_high,
|
||||
endpoint->rem_qp_num_high,
|
||||
endpoint->rem_psn_high,
|
||||
endpoint->rem_lid,
|
||||
endpoint->rem_info.rem_qp_num_high,
|
||||
endpoint->rem_info.rem_psn_high,
|
||||
endpoint->rem_info.rem_lid,
|
||||
openib_btl->port_num
|
||||
);
|
||||
|
||||
@ -665,9 +717,9 @@ int mca_btl_openib_endpoint_connect(
|
||||
endpoint->lcl_qp_low,
|
||||
endpoint->lcl_qp_attr_low,
|
||||
endpoint->lcl_psn_low,
|
||||
endpoint->rem_qp_num_low,
|
||||
endpoint->rem_psn_low,
|
||||
endpoint->rem_lid,
|
||||
endpoint->rem_info.rem_qp_num_low,
|
||||
endpoint->rem_info.rem_psn_low,
|
||||
endpoint->rem_info.rem_lid,
|
||||
openib_btl->port_num
|
||||
);
|
||||
|
||||
|
@ -33,6 +33,15 @@ extern "C" {
|
||||
#endif
|
||||
OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t);
|
||||
|
||||
|
||||
struct mca_btl_openib_frag_t;
|
||||
|
||||
struct mca_btl_openib_port_info_t {
|
||||
uint16_t subnet;
|
||||
};
|
||||
typedef struct mca_btl_openib_port_info_t mca_btl_openib_port_info_t;
|
||||
|
||||
|
||||
/**
|
||||
* State of IB endpoint connection.
|
||||
*/
|
||||
@ -44,6 +53,9 @@ typedef enum {
|
||||
|
||||
/* Waiting for ack from endpoint */
|
||||
MCA_BTL_IB_CONNECT_ACK,
|
||||
|
||||
/*Waiting for final connection ACK from endpoint */
|
||||
MCA_BTL_IB_WAITING_ACK,
|
||||
|
||||
/* Connected ... both sender & receiver have
|
||||
* buffers associated with this connection */
|
||||
@ -58,6 +70,29 @@ typedef enum {
|
||||
MCA_BTL_IB_FAILED
|
||||
} mca_btl_openib_endpoint_state_t;
|
||||
|
||||
struct mca_btl_openib_rem_info_t {
|
||||
|
||||
uint32_t rem_qp_num_high;
|
||||
uint32_t rem_qp_num_low;
|
||||
/* Remote QP number (Low and High priority) */
|
||||
|
||||
uint16_t rem_lid;
|
||||
/* Local identifier of the remote process */
|
||||
|
||||
|
||||
uint32_t rem_psn_high;
|
||||
uint32_t rem_psn_low;
|
||||
/* Remote processes port sequence number (Low and High) */
|
||||
|
||||
uint16_t rem_subnet;
|
||||
/* subnet of remote process */
|
||||
|
||||
|
||||
};
|
||||
typedef struct mca_btl_openib_rem_info_t mca_btl_openib_rem_info_t;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* An abstraction that represents a connection to a endpoint process.
|
||||
* An instance of mca_btl_base_endpoint_t is associated w/ each process
|
||||
@ -92,17 +127,7 @@ struct mca_btl_base_endpoint_t {
|
||||
opal_list_t pending_send_frags;
|
||||
/**< list of pending send frags for this endpotint */
|
||||
|
||||
uint32_t rem_qp_num_high;
|
||||
uint32_t rem_qp_num_low;
|
||||
/* Remote QP number (Low and High priority) */
|
||||
|
||||
uint16_t rem_lid;
|
||||
/* Local identifier of the remote process */
|
||||
|
||||
|
||||
uint32_t rem_psn_high;
|
||||
uint32_t rem_psn_low;
|
||||
/* Remote processes port sequence number (Low and High) */
|
||||
mca_btl_openib_rem_info_t rem_info;
|
||||
|
||||
uint32_t lcl_psn_high;
|
||||
uint32_t lcl_psn_low;
|
||||
@ -119,7 +144,7 @@ struct mca_btl_base_endpoint_t {
|
||||
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
|
||||
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
|
||||
|
||||
|
||||
uint16_t subnet; /**< subnet of this endpoint*/
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
@ -130,8 +155,6 @@ int mca_btl_openib_endpoint_connect(mca_btl_base_endpoint_t*);
|
||||
void mca_btl_openib_post_recv(void);
|
||||
|
||||
|
||||
void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t*);
|
||||
|
||||
|
||||
#define MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(post_rr_high_endpoint, \
|
||||
post_rr_high_additional) \
|
||||
|
@ -61,7 +61,7 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
|
||||
frag->wr_desc.sr_desc.wr_id = (uint64_t) frag;
|
||||
frag->wr_desc.sr_desc.wr_id = (unsigned long) frag;
|
||||
frag->wr_desc.sr_desc.sg_list = &frag->sg_entry;
|
||||
frag->wr_desc.sr_desc.num_sge = 1;
|
||||
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
|
||||
@ -78,7 +78,7 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f
|
||||
frag->base.des_src = NULL;
|
||||
frag->base.des_src_cnt = 0;
|
||||
|
||||
frag->wr_desc.rr_desc.wr_id = (uint64_t) frag;
|
||||
frag->wr_desc.rr_desc.wr_id = (unsigned long) frag;
|
||||
frag->wr_desc.rr_desc.sg_list = &frag->sg_entry;
|
||||
frag->wr_desc.rr_desc.num_sge = 1;
|
||||
frag->wr_desc.rr_desc.next = NULL;
|
||||
|
@ -32,7 +32,7 @@ OBJ_CLASS_INSTANCE(mca_btl_openib_proc_t,
|
||||
void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* proc)
|
||||
{
|
||||
proc->proc_ompi = 0;
|
||||
proc->proc_addr_count = 0;
|
||||
proc->proc_port_count = 0;
|
||||
proc->proc_endpoints = 0;
|
||||
proc->proc_endpoint_count = 0;
|
||||
OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
|
||||
@ -95,7 +95,9 @@ static mca_btl_openib_proc_t* mca_btl_openib_proc_lookup_ompi(ompi_proc_t* ompi_
|
||||
mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
{
|
||||
mca_btl_openib_proc_t* module_proc = NULL;
|
||||
|
||||
size_t size;
|
||||
int rc;
|
||||
|
||||
/* Check if we have already created a IB proc
|
||||
* structure for this ompi process */
|
||||
module_proc = mca_btl_openib_proc_lookup_ompi(ompi_proc);
|
||||
@ -116,19 +118,42 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
* size) to represent the proc */
|
||||
module_proc->proc_guid = ompi_proc->proc_name;
|
||||
|
||||
/* IB module doesn't have addresses exported at
|
||||
* initialization, so the addr_count is set to one. */
|
||||
module_proc->proc_addr_count = 1;
|
||||
|
||||
/* query for the peer address info */
|
||||
rc = mca_pml_base_modex_recv(
|
||||
&mca_btl_openib_component.super.btl_version,
|
||||
ompi_proc,
|
||||
(void*)&module_proc->proc_ports,
|
||||
&size
|
||||
);
|
||||
|
||||
|
||||
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(0, "[%s:%d] mca_pml_base_modex_recv failed for peer [%d,%d,%d]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_openib_port_info_t)) != 0) {
|
||||
opal_output(0, "[%s:%d] invalid module address for peer [%d,%d,%d]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* XXX: Right now, there can be only 1 peer associated
|
||||
* with a proc. Needs a little bit change in
|
||||
* mca_btl_openib_proc_t to allow on demand increasing of
|
||||
* number of endpoints for this proc */
|
||||
|
||||
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
|
||||
malloc(module_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*));
|
||||
|
||||
module_proc->proc_port_count = size/sizeof(mca_btl_openib_port_info_t);
|
||||
|
||||
|
||||
if (0 == module_proc->proc_port_count) {
|
||||
module_proc->proc_endpoints = NULL;
|
||||
} else {
|
||||
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
|
||||
malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
|
||||
}
|
||||
|
||||
if(NULL == module_proc->proc_endpoints) {
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
|
@ -43,9 +43,10 @@ struct mca_btl_openib_proc_t {
|
||||
|
||||
orte_process_name_t proc_guid;
|
||||
/**< globally unique identifier for the process */
|
||||
|
||||
size_t proc_addr_count;
|
||||
/**< number of addresses published by endpoint */
|
||||
|
||||
struct mca_btl_openib_port_info_t* proc_ports;
|
||||
size_t proc_port_count;
|
||||
/**< number of ports published by endpoint */
|
||||
|
||||
struct mca_btl_base_endpoint_t **proc_endpoints;
|
||||
/**< array of endpoints that have been created to access this proc */
|
||||
|
@ -66,8 +66,8 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata)
|
||||
for(i = 0; i < cnt; i++) {
|
||||
|
||||
reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(®s, i);
|
||||
if(base_addr < (void*) reg->bound) {
|
||||
base_addr = down_align_addr( reg->bound, mca_mpool_base_page_size_log );
|
||||
if(base_addr < (void*) ((unsigned long) reg->bound - mca_mpool_base_page_size + 1)) {
|
||||
base_addr = (reg->bound - mca_mpool_base_page_size + 1);
|
||||
}
|
||||
if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
|
||||
assert(reg->ref_count <= 3);
|
||||
|
@ -90,7 +90,7 @@ int mca_mpool_gm_register(
|
||||
reg->mpool = mpool;
|
||||
reg->base = down_align_addr(addr, mca_mpool_base_page_size_log);
|
||||
reg->flags = flags;
|
||||
reg->bound = up_align_addr(addr + size -1
|
||||
reg->bound = up_align_addr((void*) ((unsigned long) addr + size -1)
|
||||
, mca_mpool_base_page_size_log);
|
||||
|
||||
|
||||
|
@ -96,13 +96,13 @@ size_t mca_pml_ob1_rdma_btls(
|
||||
unsigned char* new_bound = reg->bound > (base + size - 1) ? reg->bound : (base + size - 1);
|
||||
size_t new_len = new_bound - new_base + 1;
|
||||
|
||||
/* printf("re-re5Bg 2: base %p size %d new_base %p new_len %d\n", base, size, new_base, new_len); */
|
||||
/* printf("re-reg 2: base %p size %d new_base %p new_len %d\n", base, size, new_base, new_len); */
|
||||
assert(new_len >= size);
|
||||
reg_temp = *reg;
|
||||
btl_mpool->mpool_deregister(btl_mpool, reg);
|
||||
btl_mpool->mpool_register(btl_mpool,
|
||||
new_base,
|
||||
new_len,
|
||||
base,
|
||||
size,
|
||||
MCA_MPOOL_FLAGS_CACHE,
|
||||
®);
|
||||
|
||||
@ -268,8 +268,8 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
|
||||
btl_mpool->mpool_deregister(btl_mpool, largest);
|
||||
assert(new_len >= size);
|
||||
btl_mpool->mpool_register(btl_mpool,
|
||||
new_base,
|
||||
new_len,
|
||||
base,
|
||||
size,
|
||||
MCA_MPOOL_FLAGS_CACHE,
|
||||
&fit);
|
||||
assert(fit->ref_count >= 3);
|
||||
|
@ -45,11 +45,11 @@ int mca_rcache_rb_mru_insert(
|
||||
* the tree and mru list. memory will be deregistered when
|
||||
* the reference count goes to zero.
|
||||
*/
|
||||
/* old_reg = (mca_mpool_base_registration_t*) */
|
||||
/* opal_list_get_first(&rcache->mru_list); */
|
||||
/* old_reg->mpool->mpool_retain(old_reg->mpool, old_reg); */
|
||||
/* old_reg->mpool->mpool_deregister(old_reg->mpool, old_reg); */
|
||||
|
||||
old_reg = (mca_mpool_base_registration_t*)
|
||||
opal_list_get_first(&rcache->mru_list);
|
||||
old_reg->mpool->mpool_retain(old_reg->mpool, old_reg);
|
||||
old_reg->mpool->mpool_deregister(old_reg->mpool, old_reg);
|
||||
|
||||
}
|
||||
opal_list_append(&rcache->mru_list,(opal_list_item_t*) reg);
|
||||
return OMPI_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user