btl/openib: add support for dynamic add_procs
Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
40067f7ec4
Коммит
2041aac4e4
@ -871,6 +871,7 @@ int mca_btl_openib_add_procs(
|
||||
for (i = 0, local_procs = 0 ; i < (int) nprocs; i++) {
|
||||
struct opal_proc_t* proc = procs[i];
|
||||
mca_btl_openib_proc_t* ib_proc;
|
||||
bool found_existing = false;
|
||||
int remote_matching_port;
|
||||
|
||||
opal_output(-1, "add procs: adding proc %d", i);
|
||||
@ -898,6 +899,24 @@ int mca_btl_openib_add_procs(
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
|
||||
for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) {
|
||||
endpoint = ib_proc->proc_endpoints[j];
|
||||
if (endpoint->endpoint_btl == openib_btl) {
|
||||
found_existing = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
|
||||
if (found_existing) {
|
||||
if (reachable) {
|
||||
opal_bitmap_set_bit(reachable, i);
|
||||
}
|
||||
peers[i] = endpoint;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check if the remote proc has any ports that:
|
||||
- on the same subnet as the local proc, and
|
||||
- on that subnet, has a CPC in common with the local proc
|
||||
@ -1048,6 +1067,37 @@ int mca_btl_openib_add_procs(
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, struct opal_proc_t *proc)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl;
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_openib_proc_t *ib_proc;
|
||||
|
||||
if (NULL == (ib_proc = mca_btl_openib_proc_create(proc))) {
|
||||
/* if we don't have connection info for this process, it's
|
||||
* okay because some other method might be able to reach it,
|
||||
* so just mark it as unreachable by us */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
|
||||
for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) {
|
||||
endpoint = ib_proc->proc_endpoints[j];
|
||||
if (endpoint->endpoint_btl == openib_btl) {
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
return endpoint;
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
|
||||
BTL_VERBOSE(("creating new endpoint for remote process {.jobid = 0x%x, .vpid = 0x%x}",
|
||||
proc->proc_name.jobid, proc->proc_name.vpid));
|
||||
|
||||
endpoint = NULL;
|
||||
(void) mca_btl_openib_add_procs (btl, 1, &proc, &endpoint, NULL);
|
||||
return endpoint;
|
||||
}
|
||||
|
||||
/*
|
||||
* delete the proc as reachable from this btl module
|
||||
*/
|
||||
|
@ -874,6 +874,18 @@ int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
|
||||
|
||||
const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
|
||||
|
||||
/**
|
||||
* Get an endpoint for a process
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param proc (IN) opal process object
|
||||
*
|
||||
* This function will return an existing endpoint if one exists; otherwise it will allocate
|
||||
* a new endpoint and return it. NULL is returned if no endpoint could be obtained.
|
||||
*/
|
||||
struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl,
|
||||
struct opal_proc_t *proc);
|
||||
|
||||
/**
|
||||
* Get a transport type of btl.
|
||||
*/
|
||||
|
@ -218,6 +218,7 @@ typedef struct udcm_msg_hdr {
|
||||
union {
|
||||
/* UDCM_MESSAGE_CONNECT */
|
||||
struct msg_connect {
|
||||
opal_process_name_t rem_name;
|
||||
int32_t rem_ep_index;
|
||||
uint8_t rem_port_num;
|
||||
} req;
|
||||
@ -1473,36 +1474,26 @@ static int udcm_rc_qp_create_all (mca_btl_base_endpoint_t *lcl_ep)
|
||||
/* JMS: optimization target -- can we send something in private
|
||||
data to find the proc directly instead of having to search
|
||||
through *all* procs? */
|
||||
static mca_btl_openib_endpoint_t *udcm_find_endpoint (opal_pointer_array_t *endpoints,
|
||||
static mca_btl_openib_endpoint_t *udcm_find_endpoint (struct mca_btl_openib_module_t *btl,
|
||||
uint32_t qp_num, uint16_t lid,
|
||||
udcm_msg_hdr_t *msg_hdr)
|
||||
{
|
||||
uint8_t port_num;
|
||||
int i;
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
struct opal_proc_t *opal_proc;
|
||||
|
||||
port_num = msg_hdr->data.req.rem_port_num;
|
||||
|
||||
for (i = 0 ; i < opal_pointer_array_get_size (endpoints) ; ++i) {
|
||||
mca_btl_openib_endpoint_t *endpoint;
|
||||
modex_msg_t *msg;
|
||||
|
||||
endpoint = (mca_btl_openib_endpoint_t *)
|
||||
opal_pointer_array_get_item (endpoints, i);
|
||||
if (NULL == endpoint) {
|
||||
continue;
|
||||
}
|
||||
|
||||
msg = UDCM_ENDPOINT_REM_MODEX(endpoint);
|
||||
|
||||
if (msg->mm_qp_num == qp_num && msg->mm_port_num == port_num &&
|
||||
msg->mm_lid == lid)
|
||||
return endpoint;
|
||||
opal_proc = opal_proc_for_name (msg_hdr->data.req.rem_name);
|
||||
if (NULL == opal_proc) {
|
||||
BTL_ERROR(("could not get proc associated with remote peer"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d",
|
||||
port_num, lid, msg_hdr->type));
|
||||
endpoint = mca_btl_openib_get_ep (&btl->super, opal_proc);
|
||||
if (NULL == endpoint) {
|
||||
BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d",
|
||||
msg_hdr->data.req.rem_port_num, lid, msg_hdr->type));
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return endpoint;
|
||||
}
|
||||
|
||||
static int udcm_endpoint_init_data (mca_btl_base_endpoint_t *lcl_ep)
|
||||
@ -1678,6 +1669,7 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
|
||||
|
||||
msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index);
|
||||
msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num;
|
||||
msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME;
|
||||
|
||||
for (i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) {
|
||||
msg->data->qps[i].psn = htonl(lcl_ep->qps[i].qp->lcl_psn);
|
||||
@ -1981,8 +1973,7 @@ static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m)
|
||||
lcl_ep = message->hdr.lcl_ep;
|
||||
|
||||
if (NULL == lcl_ep) {
|
||||
lcl_ep = udcm_find_endpoint (m->btl->device->endpoints, wc[i].src_qp,
|
||||
wc[i].slid, &message->hdr);
|
||||
lcl_ep = udcm_find_endpoint (m->btl, wc[i].src_qp, wc[i].slid, &message->hdr);
|
||||
}
|
||||
|
||||
if (NULL == lcl_ep ) {
|
||||
@ -2824,6 +2815,7 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_
|
||||
|
||||
msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index);
|
||||
msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num;
|
||||
msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME;
|
||||
|
||||
if (UDCM_MESSAGE_XCONNECT == msg_type) {
|
||||
BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user