
Add multi-NIC support to openib

Fix connection establishment race in openib
Other miscellaneous fixes

This commit was SVN r7570.
This commit is contained in:
Galen Shipman 2005-09-30 22:58:09 +00:00
parent 0459678f82
commit 67d38b7896
13 changed files with 334 additions and 191 deletions
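The race fix below replaces the old exchange, where the initiating side marked itself CONNECTED as soon as its own queue pairs were wired up, with a four-message handshake and a new MCA_BTL_IB_WAITING_ACK state, so neither side posts sends before the peer's QPs are ready. Here is a minimal sketch of the resulting state machine, reconstructed from the switch statement in mca_btl_openib_endpoint_recv() below; the transitions follow the commit, but the shortened enum names and the driver in main() are illustrative only.

/* Sketch of the connection handshake introduced by this commit.
 * Reconstructed from mca_btl_openib_endpoint_recv(); names shortened. */
#include <stdio.h>

typedef enum {
    IB_CLOSED,      /* no connection attempt yet                 */
    IB_CONNECTING,  /* we initiated and sent our QP info         */
    IB_CONNECT_ACK, /* we replied with our QP info, awaiting ack */
    IB_WAITING_ACK, /* QPs wired up locally, awaiting final ack  */
    IB_CONNECTED    /* both sides may post sends                 */
} ib_state_t;

/* One OOB message arrives carrying the peer's QP/LID/subnet info. */
static ib_state_t on_recv(ib_state_t state)
{
    switch (state) {
    case IB_CLOSED:      return IB_CONNECT_ACK; /* reply with our QP info   */
    case IB_CONNECTING:  return IB_WAITING_ACK; /* wire up QPs, send an ack */
    case IB_CONNECT_ACK: return IB_CONNECTED;   /* send final ack, done     */
    case IB_WAITING_ACK: return IB_CONNECTED;   /* peer's QPs ready, done   */
    default:             return state;
    }
}

int main(void)
{
    ib_state_t a = IB_CONNECTING, b = IB_CLOSED; /* A initiates */
    b = on_recv(b); /* B gets A's request   -> IB_CONNECT_ACK */
    a = on_recv(a); /* A gets B's reply     -> IB_WAITING_ACK */
    b = on_recv(b); /* B gets A's ack       -> IB_CONNECTED   */
    a = on_recv(a); /* A gets B's final ack -> IB_CONNECTED   */
    printf("A=%d B=%d (%d == CONNECTED)\n", a, b, IB_CONNECTED);
    return 0;
}

Dropping the final ack leaves A parked in IB_WAITING_ACK rather than prematurely CONNECTED, which is exactly the window the old code left open.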

View file

@@ -563,7 +563,7 @@ static void mca_btl_mvapi_endpoint_recv(
port_info = ib_proc->proc_ports[i];
ib_endpoint = ib_proc->proc_endpoints[i];
if(!ib_endpoint->rem_info.rem_lid &&
ib_endpoint->rem_info.rem_subnet == rem_info.rem_subnet) {
ib_endpoint->subnet == rem_info.rem_subnet) {
/* found a match based on subnet! */
found = true;
break;

View file

@@ -106,6 +106,7 @@ int mca_btl_openib_add_procs(
}
ib_peer->endpoint_btl = openib_btl;
ib_peer->subnet = openib_btl->port_info.subnet;
rc = mca_btl_openib_proc_insert(ib_proc, ib_peer);
if(rc != OMPI_SUCCESS) {
OBJ_RELEASE(ib_peer);
@@ -360,7 +361,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segment.seg_addr.lval + reserve;
iov.iov_base = (unsigned char*) (frag->segment.seg_addr.lval + reserve);
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
*size = max_data;

View file

@@ -36,6 +36,7 @@
#include "mca/btl/btl.h"
#include "mca/btl/base/base.h"
#include "btl_openib_endpoint.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
@@ -126,7 +127,7 @@ struct mca_btl_openib_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
bool btl_inited;
mca_btl_openib_recv_reg_t ib_reg[256];
mca_btl_openib_port_info_t port_info; /* contains only the subnet right now */
uint8_t port_num; /**< ID of the PORT */
struct ibv_device *ib_dev; /* the ib device */
struct ibv_context *ib_dev_context;

View file

@@ -41,6 +41,8 @@
#include <errno.h>
#include <string.h> /* for strerror()*/
#include "mca/pml/base/pml_base_module_exchange.h"
mca_btl_openib_component_t mca_btl_openib_component = {
{
/* First, the mca_base_component_t struct containing meta information
@@ -225,6 +227,40 @@ int mca_btl_openib_component_close(void)
return OMPI_SUCCESS;
}
/*
* Register openib port information. The MCA framework
* will make this available to all peers.
*/
static int
mca_btl_openib_modex_send(void)
{
int rc;
size_t i;
size_t size;
mca_btl_openib_port_info_t *ports = NULL;
size = mca_btl_openib_component.ib_num_btls * sizeof (mca_btl_openib_port_info_t);
if (size != 0) {
ports = (mca_btl_openib_port_info_t *)malloc (size);
if (NULL == ports) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
mca_btl_openib_module_t *btl = &mca_btl_openib_component.openib_btls[i];
ports[i] = btl->port_info;
}
}
rc = mca_pml_base_modex_send (&mca_btl_openib_component.super.btl_version, ports, size);
if (NULL != ports) {
free (ports);
}
return rc;
}
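On the consuming side, the same blob is pulled back out of the modex and its size turned back into a port count. A condensed sketch of what the mca_btl_openib_proc_create() hunk later in this commit does, with the error paths and endpoint-array allocation elided:

/* Condensed from mca_btl_openib_proc_create() below: recover the peer's
 * published ports so that one endpoint per port can be created. */
mca_btl_openib_port_info_t *ports;
size_t size;
int rc = mca_pml_base_modex_recv(&mca_btl_openib_component.super.btl_version,
                                 ompi_proc, (void*)&ports, &size);
if (OMPI_SUCCESS == rc && 0 == (size % sizeof(mca_btl_openib_port_info_t))) {
    size_t nports = size / sizeof(mca_btl_openib_port_info_t);
    /* nports drives how many endpoints get built for this proc */
}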
/*
* IB component initialization:
* (1) read interface list from kernel and compare against component parameters
@@ -260,6 +296,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
dev_list = ibv_get_devices();
if (NULL == dev_list) {
mca_btl_base_error_no_nics("OpenIB", "HCA");
mca_btl_openib_component.ib_num_btls = 0;
mca_btl_openib_modex_send();
return NULL;
}
dlist_start(dev_list);
@@ -269,6 +307,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
if(0 == num_devs) {
mca_btl_base_error_no_nics("OpenIB", "HCA");
mca_btl_openib_modex_send();
return NULL;
}
@@ -335,7 +374,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
openib_btl->ib_dev_context = ib_dev_context;
openib_btl->port_num = (uint8_t) j;
openib_btl->ib_port_attr = ib_port_attr;
openib_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
mca_btl_openib_component.ib_num_btls ++;
@@ -482,6 +521,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
/* Post OOB receive to support dynamic connection setup */
mca_btl_openib_post_recv();
mca_btl_openib_modex_send();
*num_btl_modules = mca_btl_openib_component.ib_num_btls;
free(ib_devs);
@@ -533,7 +573,7 @@ int mca_btl_openib_component_progress()
/* Process a RECV */
BTL_VERBOSE(("Got an recv on the completion queue"));
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag = (mca_btl_openib_frag_t*) (void*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =

View file

@@ -128,8 +128,12 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
memset(endpoint->lcl_qp_attr_low, 0, sizeof(struct ibv_qp_attr));
endpoint->rr_posted_high = 0;
endpoint->rr_posted_low = 0;
endpoint->rem_info.rem_qp_num_high = 0;
endpoint->rem_info.rem_qp_num_low = 0;
endpoint->rem_info.rem_lid = 0;
endpoint->rem_info.rem_psn_high = 0;
endpoint->rem_info.rem_psn_low = 0;
endpoint->rem_info.rem_subnet = 0;
}
/*
@@ -157,7 +161,7 @@ static void mca_btl_openib_endpoint_send_cb(
}
static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* endpoint)
static int mca_btl_openib_endpoint_send_connect_data(mca_btl_base_endpoint_t* endpoint)
{
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
int rc;
@@ -191,12 +195,19 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.pack(buffer, &endpoint->endpoint_btl->ib_port_attr->lid, 1, ORTE_UINT16);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.pack(buffer, &((mca_btl_openib_endpoint_t*) endpoint)->subnet, 1, ORTE_UINT16);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
@@ -217,91 +228,23 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end
return OMPI_SUCCESS;
}
/*
* Send connect ACK to remote endpoint
*
*/
static int mca_btl_openib_endpoint_send_connect_ack(mca_btl_base_endpoint_t* endpoint)
{
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
int rc;
uint32_t zero = 0;
/* pack the info in the send buffer */
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT16))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* send to endpoint */
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
mca_btl_openib_endpoint_send_cb, NULL);
if(rc < 0) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/*
* Set remote connection info
* (from OOB connection)
*
*/
static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, orte_buffer_t* buffer)
static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, mca_btl_openib_rem_info_t* rem_info)
{
int rc;
size_t cnt = 1;
rc = orte_dps.unpack(buffer, &endpoint->rem_qp_num_high, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &endpoint->rem_qp_num_low, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &endpoint->rem_psn_high, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &endpoint->rem_psn_low, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &endpoint->rem_lid, &cnt, ORTE_UINT16);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
}
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
endpoint->rem_qp_num_high,
endpoint->rem_qp_num_low,
endpoint->rem_lid));
memcpy(&((mca_btl_openib_endpoint_t*) endpoint)->rem_info, rem_info, sizeof(mca_btl_openib_rem_info_t));
BTL_VERBOSE(("Setting High Priority QP num = %d, Low Priority QP num %d, LID = %d",
endpoint->rem_info.rem_qp_num_high,
endpoint->rem_info.rem_qp_num_low,
endpoint->rem_info.rem_lid));
return ORTE_SUCCESS;
}
@@ -351,7 +294,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
/* Send connection info over to remote endpoint */
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_req(endpoint))) {
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
BTL_ERROR(("error sending connect request, error code %d", rc));
return rc;
}
@@ -362,7 +305,8 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
* Reply to a `start - connect' message
*
*/
static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint, orte_buffer_t* buffer)
static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_rem_info_t *rem_info)
{
int rc;
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl;
@@ -398,7 +342,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
/* Set the remote side info */
mca_btl_openib_endpoint_set_remote_info(endpoint, buffer);
mca_btl_openib_endpoint_set_remote_info(endpoint, rem_info);
/* Connect to endpoint */
@@ -409,13 +353,22 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
}
/* Send connection info over to remote endpoint */
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_req(endpoint))) {
endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_send_connect_data(endpoint))) {
BTL_ERROR(("error in endpoint send connect request error code is %d", rc));
return rc;
}
return OMPI_SUCCESS;
}
/*
* endpoint is waiting for an ack to finalize connection establishment
*/
static void mca_btl_openib_endpoint_waiting_ack(mca_btl_openib_endpoint_t *endpoint) {
endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK;
}
/*
* called when the openib has completed setup via the
* OOB channel
@@ -423,9 +376,25 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t *frag;
mca_btl_openib_module_t* openib_btl;
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
endpoint->endpoint_btl->poll_cq = true;
mca_btl_openib_progress_send_frags(endpoint);
/* While there are frags in the list,
* process them */
while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
frag = (mca_btl_openib_frag_t *) frag_item;
openib_btl = endpoint->endpoint_btl;
/* We need to post this one */
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
BTL_ERROR(("Error posting send"));
}
}
/*
@@ -448,6 +417,78 @@ static void mca_btl_openib_endpoint_recv(
mca_btl_openib_endpoint_t *ib_endpoint;
int endpoint_state;
int rc;
uint32_t i;
size_t cnt = 1;
mca_btl_openib_rem_info_t rem_info;
ib_endpoint = (mca_btl_openib_endpoint_t*) endpoint;
/* start by unpacking data first so we know who is knocking at
our door */
rc = orte_dps.unpack(buffer, &rem_info.rem_qp_num_high, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dps.unpack(buffer, &rem_info.rem_qp_num_low, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dps.unpack(buffer, &rem_info.rem_psn_high, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dps.unpack(buffer, &rem_info.rem_psn_low, &cnt, ORTE_UINT32);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dps.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
rc = orte_dps.unpack(buffer, &rem_info.rem_subnet, &cnt, ORTE_UINT16);
if(ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return;
}
#if 0
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->r_key, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->rem_base, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->rem_size, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dps.unpack(buffer, &ib_endpoint->rdma_buf->rem_cnt, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
rem_info.rem_qp_num_high,
rem_info.rem_qp_num_low,
rem_info.rem_lid));
for(ib_proc = (mca_btl_openib_proc_t*)
opal_list_get_first(&mca_btl_openib_component.ib_procs);
@@ -456,14 +497,54 @@ static void mca_btl_openib_endpoint_recv(
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
if(ib_proc->proc_guid.vpid == endpoint->vpid) {
bool found = false;
/* Try to get the endpoint instance of this proc */
/* Limitation: Right now, we have only 1 endpoint
* for every process. Need several changes, some
* in PML/BTL interface to set this right */
for(i = 0; i < ib_proc->proc_endpoint_count; i++) {
mca_btl_openib_port_info_t port_info;
port_info = ib_proc->proc_ports[i];
ib_endpoint = ib_proc->proc_endpoints[i];
if(ib_endpoint->rem_info.rem_lid &&
ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) {
/* we've seen them before! */
found = true;
break;
}
}
/* If we haven't seen this remote lid before then try to match on
endpoint */
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
mca_btl_openib_port_info_t port_info;
port_info = ib_proc->proc_ports[i];
ib_endpoint = ib_proc->proc_endpoints[i];
if(!ib_endpoint->rem_info.rem_lid &&
ib_endpoint->subnet == rem_info.rem_subnet) {
/* found a match based on subnet! */
found = true;
break;
}
}
/* try finding an open port, even if subnets
don't match
*/
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
mca_btl_openib_port_info_t port_info;
port_info = ib_proc->proc_ports[i];
ib_endpoint = ib_proc->proc_endpoints[i];
if(!ib_endpoint->rem_info.rem_lid) {
/* found an unused end-point */
found = true;
break;
}
}
if(!found) {
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
return;
}
ib_endpoint = ib_proc->proc_endpoints[0];
endpoint_state = ib_endpoint->endpoint_state;
/* Update status */
@@ -474,37 +555,38 @@ static void mca_btl_openib_endpoint_recv(
* status of this connection to CONNECTING,
* and then reply with our QP information */
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, buffer))) {
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, &rem_info))) {
BTL_ERROR(("error in endpoint reply start connect"));
break;
}
/* Setup state as connected */
ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
break;
case MCA_BTL_IB_CONNECTING :
mca_btl_openib_endpoint_set_remote_info(ib_endpoint, buffer);
mca_btl_openib_endpoint_set_remote_info(ib_endpoint, &rem_info);
if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) {
BTL_ERROR(("endpoint connect error: %d", rc));
break;
}
/* Setup state as connected */
mca_btl_openib_endpoint_connected(ib_endpoint);
/* Setup state as awaiting ack from peer */
mca_btl_openib_endpoint_waiting_ack(ib_endpoint);
/* Send him an ack */
mca_btl_openib_endpoint_send_connect_ack(ib_endpoint);
mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
break;
case MCA_BTL_IB_WAITING_ACK:
mca_btl_openib_endpoint_connected(ib_endpoint);
break;
case MCA_BTL_IB_CONNECT_ACK:
mca_btl_openib_endpoint_send_connect_data(ib_endpoint);
mca_btl_openib_endpoint_connected(ib_endpoint);
break;
case MCA_BTL_IB_CONNECTED :
break;
default :
BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
@@ -561,7 +643,7 @@ int mca_btl_openib_endpoint_send(
break;
case MCA_BTL_IB_CONNECT_ACK:
case MCA_BTL_IB_WAITING_ACK:
BTL_VERBOSE(("Queuing because waiting for ack"));
opal_list_append(&endpoint->pending_send_frags,
@@ -586,10 +668,10 @@ int mca_btl_openib_endpoint_send(
case MCA_BTL_IB_CONNECTED:
{
openib_btl = endpoint->endpoint_btl;
BTL_VERBOSE(("Send to : %d, len : %lu, frag : %llu",
BTL_VERBOSE(("Send to : %d, len : %lu, frag : %p",
endpoint->endpoint_proc->proc_guid.vpid,
frag->sg_entry.length,
(unsigned long long) frag));
frag));
rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag);
break;
}
@@ -602,36 +684,6 @@ int mca_btl_openib_endpoint_send(
return rc;
}
/*
* Progress send frags, any frags that have been queued up before the connection
* was established need to be progressed.
*/
void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t* endpoint)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t *frag;
mca_btl_openib_module_t* openib_btl;
/*Check if endpoint is connected */
if(endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
return;
}
/* While there are frags in the list,
* process them */
while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
frag = (mca_btl_openib_frag_t *) frag_item;
openib_btl = endpoint->endpoint_btl;
/* We need to post this one */
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
BTL_ERROR(("Error posting send"));
}
}
/*
* Complete connection to endpoint.
@@ -649,9 +701,9 @@ int mca_btl_openib_endpoint_connect(
endpoint->lcl_qp_high,
endpoint->lcl_qp_attr_high,
endpoint->lcl_psn_high,
endpoint->rem_qp_num_high,
endpoint->rem_psn_high,
endpoint->rem_lid,
endpoint->rem_info.rem_qp_num_high,
endpoint->rem_info.rem_psn_high,
endpoint->rem_info.rem_lid,
openib_btl->port_num
);
@@ -665,9 +717,9 @@ int mca_btl_openib_endpoint_connect(
endpoint->lcl_qp_low,
endpoint->lcl_qp_attr_low,
endpoint->lcl_psn_low,
endpoint->rem_qp_num_low,
endpoint->rem_psn_low,
endpoint->rem_lid,
endpoint->rem_info.rem_qp_num_low,
endpoint->rem_info.rem_psn_low,
endpoint->rem_info.rem_lid,
openib_btl->port_num
);

View file

@@ -33,6 +33,15 @@ extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t);
struct mca_btl_openib_frag_t;
struct mca_btl_openib_port_info_t {
uint16_t subnet;
};
typedef struct mca_btl_openib_port_info_t mca_btl_openib_port_info_t;
/**
* State of IB endpoint connection.
*/
@@ -44,6 +53,9 @@ typedef enum {
/* Waiting for ack from endpoint */
MCA_BTL_IB_CONNECT_ACK,
/*Waiting for final connection ACK from endpoint */
MCA_BTL_IB_WAITING_ACK,
/* Connected ... both sender & receiver have
* buffers associated with this connection */
@@ -58,6 +70,29 @@ typedef enum {
MCA_BTL_IB_FAILED
} mca_btl_openib_endpoint_state_t;
struct mca_btl_openib_rem_info_t {
uint32_t rem_qp_num_high;
uint32_t rem_qp_num_low;
/* Remote QP number (Low and High priority) */
uint16_t rem_lid;
/* Local identifier of the remote process */
uint32_t rem_psn_high;
uint32_t rem_psn_low;
/* Remote process's port sequence number (Low and High) */
uint16_t rem_subnet;
/* subnet of remote process */
};
typedef struct mca_btl_openib_rem_info_t mca_btl_openib_rem_info_t;
/**
* An abstraction that represents a connection to a endpoint process.
* An instance of mca_btl_base_endpoint_t is associated w/ each process
@@ -92,17 +127,7 @@ struct mca_btl_base_endpoint_t {
opal_list_t pending_send_frags;
/**< list of pending send frags for this endpoint */
uint32_t rem_qp_num_high;
uint32_t rem_qp_num_low;
/* Remote QP number (Low and High priority) */
uint16_t rem_lid;
/* Local identifier of the remote process */
uint32_t rem_psn_high;
uint32_t rem_psn_low;
/* Remote processes port sequence number (Low and High) */
mca_btl_openib_rem_info_t rem_info;
uint32_t lcl_psn_high;
uint32_t lcl_psn_low;
@@ -119,7 +144,7 @@ struct mca_btl_base_endpoint_t {
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
uint16_t subnet; /**< subnet of this endpoint*/
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@@ -130,8 +155,6 @@ int mca_btl_openib_endpoint_connect(mca_btl_base_endpoint_t*);
void mca_btl_openib_post_recv(void);
void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t*);
#define MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(post_rr_high_endpoint, \
post_rr_high_additional) \

View file

@@ -61,7 +61,7 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->wr_desc.sr_desc.wr_id = (uint64_t) frag;
frag->wr_desc.sr_desc.wr_id = (unsigned long) frag;
frag->wr_desc.sr_desc.sg_list = &frag->sg_entry;
frag->wr_desc.sr_desc.num_sge = 1;
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
@@ -78,7 +78,7 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->wr_desc.rr_desc.wr_id = (uint64_t) frag;
frag->wr_desc.rr_desc.wr_id = (unsigned long) frag;
frag->wr_desc.rr_desc.sg_list = &frag->sg_entry;
frag->wr_desc.rr_desc.num_sge = 1;
frag->wr_desc.rr_desc.next = NULL;

View file

@@ -32,7 +32,7 @@ OBJ_CLASS_INSTANCE(mca_btl_openib_proc_t,
void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* proc)
{
proc->proc_ompi = 0;
proc->proc_addr_count = 0;
proc->proc_port_count = 0;
proc->proc_endpoints = 0;
proc->proc_endpoint_count = 0;
OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
@@ -95,7 +95,9 @@ static mca_btl_openib_proc_t* mca_btl_openib_proc_lookup_ompi(ompi_proc_t* ompi_
mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
{
mca_btl_openib_proc_t* module_proc = NULL;
size_t size;
int rc;
/* Check if we have already created a IB proc
* structure for this ompi process */
module_proc = mca_btl_openib_proc_lookup_ompi(ompi_proc);
@@ -116,19 +118,42 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
* size) to represent the proc */
module_proc->proc_guid = ompi_proc->proc_name;
/* IB module doesn't have addresses exported at
* initialization, so the addr_count is set to one. */
module_proc->proc_addr_count = 1;
/* query for the peer address info */
rc = mca_pml_base_modex_recv(
&mca_btl_openib_component.super.btl_version,
ompi_proc,
(void*)&module_proc->proc_ports,
&size
);
if(OMPI_SUCCESS != rc) {
opal_output(0, "[%s:%d] mca_pml_base_modex_recv failed for peer [%d,%d,%d]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}
if((size % sizeof(mca_btl_openib_port_info_t)) != 0) {
opal_output(0, "[%s:%d] invalid module address for peer [%d,%d,%d]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}
/* XXX: Right now, there can be only 1 peer associated
* with a proc. Needs a bit of a change in
* mca_btl_openib_proc_t to allow on-demand increasing of
* the number of endpoints for this proc */
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(module_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*));
module_proc->proc_port_count = size/sizeof(mca_btl_openib_port_info_t);
if (0 == module_proc->proc_port_count) {
module_proc->proc_endpoints = NULL;
} else {
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
}
if(NULL == module_proc->proc_endpoints) {
OBJ_RELEASE(module_proc);
return NULL;

View file

@@ -43,9 +43,10 @@ struct mca_btl_openib_proc_t {
orte_process_name_t proc_guid;
/**< globally unique identifier for the process */
size_t proc_addr_count;
/**< number of addresses published by endpoint */
struct mca_btl_openib_port_info_t* proc_ports;
size_t proc_port_count;
/**< number of ports published by endpoint */
struct mca_btl_base_endpoint_t **proc_endpoints;
/**< array of endpoints that have been created to access this proc */

View file

@@ -66,8 +66,8 @@ void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata)
for(i = 0; i < cnt; i++) {
reg = (mca_mpool_base_registration_t*)ompi_pointer_array_get_item(&regs, i);
if(base_addr < (void*) reg->bound) {
base_addr = down_align_addr( reg->bound, mca_mpool_base_page_size_log );
if(base_addr < (void*) ((unsigned long) reg->bound - mca_mpool_base_page_size + 1)) {
base_addr = (reg->bound - mca_mpool_base_page_size + 1);
}
if(reg->flags & MCA_MPOOL_FLAGS_CACHE) {
assert(reg->ref_count <= 3);

View file

@@ -90,7 +90,7 @@ int mca_mpool_gm_register(
reg->mpool = mpool;
reg->base = down_align_addr(addr, mca_mpool_base_page_size_log);
reg->flags = flags;
reg->bound = up_align_addr(addr + size -1
reg->bound = up_align_addr((void*) ((unsigned long) addr + size -1)
, mca_mpool_base_page_size_log);

View file

@@ -96,13 +96,13 @@ size_t mca_pml_ob1_rdma_btls(
unsigned char* new_bound = reg->bound > (base + size - 1) ? reg->bound : (base + size - 1);
size_t new_len = new_bound - new_base + 1;
/* printf("re-re5Bg 2: base %p size %d new_base %p new_len %d\n", base, size, new_base, new_len); */
/* printf("re-reg 2: base %p size %d new_base %p new_len %d\n", base, size, new_base, new_len); */
assert(new_len >= size);
reg_temp = *reg;
btl_mpool->mpool_deregister(btl_mpool, reg);
btl_mpool->mpool_register(btl_mpool,
new_base,
new_len,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&reg);
@@ -268,8 +268,8 @@ mca_mpool_base_registration_t* mca_pml_ob1_rdma_registration(
btl_mpool->mpool_deregister(btl_mpool, largest);
assert(new_len >= size);
btl_mpool->mpool_register(btl_mpool,
new_base,
new_len,
base,
size,
MCA_MPOOL_FLAGS_CACHE,
&fit);
assert(fit->ref_count >= 3);

View file

@@ -45,11 +45,11 @@ int mca_rcache_rb_mru_insert(
* the tree and mru list. memory will be deregistered when
* the reference count goes to zero.
*/
/* old_reg = (mca_mpool_base_registration_t*) */
/* opal_list_get_first(&rcache->mru_list); */
/* old_reg->mpool->mpool_retain(old_reg->mpool, old_reg); */
/* old_reg->mpool->mpool_deregister(old_reg->mpool, old_reg); */
old_reg = (mca_mpool_base_registration_t*)
opal_list_get_first(&rcache->mru_list);
old_reg->mpool->mpool_retain(old_reg->mpool, old_reg);
old_reg->mpool->mpool_deregister(old_reg->mpool, old_reg);
}
opal_list_append(&rcache->mru_list,(opal_list_item_t*) reg);
return OMPI_SUCCESS;