1
1

btl/openib: add support for dynamic add_procs

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2015-05-07 12:47:07 -06:00
родитель 40067f7ec4
Коммит 2041aac4e4
3 изменённых файлов: 79 добавлений и 25 удалений

Просмотреть файл

@ -871,6 +871,7 @@ int mca_btl_openib_add_procs(
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
struct opal_proc_t* proc = procs[i];
mca_btl_openib_proc_t* ib_proc;
bool found_existing = false;
int remote_matching_port;
opal_output(-1, "add procs: adding proc %d", i);
@ -898,6 +899,24 @@ int mca_btl_openib_add_procs(
continue;
}
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) {
endpoint = ib_proc->proc_endpoints[j];
if (endpoint->endpoint_btl == openib_btl) {
found_existing = true;
break;
}
}
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
if (found_existing) {
if (reachable) {
opal_bitmap_set_bit(reachable, i);
}
peers[i] = endpoint;
continue;
}
/* check if the remote proc has any ports that:
- on the same subnet as the local proc, and
- on that subnet, has a CPC in common with the local proc
@ -1048,6 +1067,37 @@ int mca_btl_openib_add_procs(
return OPAL_SUCCESS;
}
/*
 * Look up (or lazily create) the endpoint that connects this BTL module
 * to the given process.
 *
 * The openib proc object for `proc` is obtained first; if that fails, NULL
 * is returned. Otherwise the proc's endpoint list is scanned under the proc
 * lock for an endpoint that belongs to `btl`. When none is found, the
 * process is handed to mca_btl_openib_add_procs() as a one-element list and
 * whatever endpoint that call stored (possibly NULL) is returned.
 */
struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, struct opal_proc_t *proc)
{
    mca_btl_openib_module_t *module = (mca_btl_openib_module_t *) btl;
    mca_btl_base_endpoint_t *match = NULL;
    mca_btl_openib_proc_t *ib_proc = mca_btl_openib_proc_create(proc);

    if (NULL == ib_proc) {
        /* no openib proc object could be obtained for this process;
         * report that there is no endpoint for it on this module */
        return NULL;
    }

    /* search the endpoints already attached to this proc while holding
     * its lock; stop at the first one owned by this module */
    OPAL_THREAD_LOCK(&ib_proc->proc_lock);
    for (size_t idx = 0 ; idx < ib_proc->proc_endpoint_count ; ++idx) {
        mca_btl_base_endpoint_t *candidate = ib_proc->proc_endpoints[idx];

        if (candidate->endpoint_btl == module) {
            match = candidate;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);

    if (NULL != match) {
        return match;
    }

    BTL_VERBOSE(("creating new endpoint for remote process {.jobid = 0x%x, .vpid = 0x%x}",
                 proc->proc_name.jobid, proc->proc_name.vpid));

    /* the return code is intentionally ignored: on failure `match` stays
     * NULL and that is what the caller sees */
    (void) mca_btl_openib_add_procs (btl, 1, &proc, &match, NULL);

    return match;
}
/*
* delete the proc as reachable from this btl module
*/

Просмотреть файл

@ -874,6 +874,18 @@ int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
/**
 * Get an endpoint for a process
 *
 * @param btl (IN) BTL module
 * @param proc (IN) opal process object
 *
 * @return the endpoint that belongs to this BTL module for the given process,
 *         or NULL if no endpoint exists and none could be created
 *
 * This function will return an existing endpoint if one exists; otherwise it
 * will try to allocate a new endpoint and return it. Callers must check the
 * result for NULL.
 */
struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl,
struct opal_proc_t *proc);
/**
* Get a transport type of btl.
*/

Просмотреть файл

@ -218,6 +218,7 @@ typedef struct udcm_msg_hdr {
union {
/* UDCM_MESSAGE_CONNECT */
struct msg_connect {
opal_process_name_t rem_name;
int32_t rem_ep_index;
uint8_t rem_port_num;
} req;
@ -1473,36 +1474,26 @@ static int udcm_rc_qp_create_all (mca_btl_base_endpoint_t *lcl_ep)
/* JMS: optimization target -- can we send something in private
data to find the proc directly instead of having to search
through *all* procs? */
static mca_btl_openib_endpoint_t *udcm_find_endpoint (opal_pointer_array_t *endpoints,
static mca_btl_openib_endpoint_t *udcm_find_endpoint (struct mca_btl_openib_module_t *btl,
uint32_t qp_num, uint16_t lid,
udcm_msg_hdr_t *msg_hdr)
{
uint8_t port_num;
int i;
mca_btl_base_endpoint_t *endpoint;
struct opal_proc_t *opal_proc;
port_num = msg_hdr->data.req.rem_port_num;
for (i = 0 ; i < opal_pointer_array_get_size (endpoints) ; ++i) {
mca_btl_openib_endpoint_t *endpoint;
modex_msg_t *msg;
endpoint = (mca_btl_openib_endpoint_t *)
opal_pointer_array_get_item (endpoints, i);
if (NULL == endpoint) {
continue;
}
msg = UDCM_ENDPOINT_REM_MODEX(endpoint);
if (msg->mm_qp_num == qp_num && msg->mm_port_num == port_num &&
msg->mm_lid == lid)
return endpoint;
opal_proc = opal_proc_for_name (msg_hdr->data.req.rem_name);
if (NULL == opal_proc) {
BTL_ERROR(("could not get proc associated with remote peer"));
return NULL;
}
BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d",
port_num, lid, msg_hdr->type));
endpoint = mca_btl_openib_get_ep (&btl->super, opal_proc);
if (NULL == endpoint) {
BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d",
msg_hdr->data.req.rem_port_num, lid, msg_hdr->type));
}
return NULL;
return endpoint;
}
static int udcm_endpoint_init_data (mca_btl_base_endpoint_t *lcl_ep)
@ -1678,6 +1669,7 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index);
msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num;
msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME;
for (i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) {
msg->data->qps[i].psn = htonl(lcl_ep->qps[i].qp->lcl_psn);
@ -1981,8 +1973,7 @@ static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m)
lcl_ep = message->hdr.lcl_ep;
if (NULL == lcl_ep) {
lcl_ep = udcm_find_endpoint (m->btl->device->endpoints, wc[i].src_qp,
wc[i].slid, &message->hdr);
lcl_ep = udcm_find_endpoint (m->btl, wc[i].src_qp, wc[i].slid, &message->hdr);
}
if (NULL == lcl_ep ) {
@ -2824,6 +2815,7 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_
msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index);
msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num;
msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME;
if (UDCM_MESSAGE_XCONNECT == msg_type) {
BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num,