Handle different subnets correctly and support multiple-NIC endpoint negotiation
This is somewhat limited currently. For example, if you have 3 ports on Node A and 5 ports on Node B, then the peers will use 3 ports to communicate with each other. This is on a subnet basis, so for any pair of nodes we take the intersection of the available ports within a subnet. We use subnets to determine reachability for lazy connection establishment. So if Node A and Node B each have two HCAs (on separate networks), then the subnets must be distinct; otherwise we will try to wire up HCAs on separate networks. This commit was SVN r12978.
Этот коммит содержится в:
родитель
7cac26d240
Коммит
f12bbe0591
@ -38,6 +38,8 @@
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
{
|
||||
&mca_btl_openib_component.super,
|
||||
@ -84,55 +86,91 @@ int mca_btl_openib_add_procs(
|
||||
ompi_bitmap_t* reachable)
|
||||
{
|
||||
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
|
||||
int i, rc;
|
||||
|
||||
int i,j, rc;
|
||||
int remote_subnets;
|
||||
int local_subnets;
|
||||
int btl_rank;
|
||||
for(i = 0; i < (int) nprocs; i++) {
|
||||
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
mca_btl_openib_proc_t* ib_proc;
|
||||
mca_btl_base_endpoint_t* ib_peer;
|
||||
|
||||
mca_btl_base_endpoint_t* endpoint;
|
||||
|
||||
|
||||
if(NULL == (ib_proc = mca_btl_openib_proc_create(ompi_proc))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to make sure that the peer has at least as many interface
|
||||
* addresses exported as we are trying to use. If not, then
|
||||
* don't bind this PTL instance to the proc.
|
||||
*/
|
||||
|
||||
remote_subnets = 0;
|
||||
/* check if the remote proc has a reachable subnet first */
|
||||
BTL_VERBOSE(("got %d port_infos \n", ib_proc->proc_port_count));
|
||||
for(j = 0; j < (int) ib_proc->proc_port_count; j++){
|
||||
BTL_VERBOSE(("got a subnet %016x\n",
|
||||
ib_proc->proc_ports[j].subnet));
|
||||
if(ib_proc->proc_ports[j].subnet ==
|
||||
openib_btl->port_info.subnet) {
|
||||
BTL_VERBOSE(("Got a matching subnet!\n"));
|
||||
remote_subnets++;
|
||||
}
|
||||
}
|
||||
if(!remote_subnets) {
|
||||
/* no use trying to communicate with this endpointlater */
|
||||
BTL_VERBOSE(("No matching subnet was found, moving on.. \n"));
|
||||
continue;
|
||||
}
|
||||
|
||||
local_subnets = 0;
|
||||
for(j=0; j < mca_btl_openib_component.ib_num_btls; j++){
|
||||
if(mca_btl_openib_component.openib_btls[j].port_info.subnet
|
||||
== openib_btl->port_info.subnet) {
|
||||
local_subnets++;
|
||||
}
|
||||
if(openib_btl == &(mca_btl_openib_component.openib_btls[j])) {
|
||||
btl_rank = j;
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
num_endpoints = remote_subnets / local_subnets +
|
||||
(btl_rank < (remote_subnets / local_subnets)) ? 1:0;
|
||||
|
||||
#endif
|
||||
if(remote_subnets < local_subnets &&
|
||||
btl_rank >= remote_subnets) {
|
||||
BTL_VERBOSE(("Not enough remote subnets, moving on.. \n"));
|
||||
continue;
|
||||
|
||||
}
|
||||
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
|
||||
|
||||
/* The btl_proc datastructure is shared by all IB PTL
|
||||
* instances that are trying to reach this destination.
|
||||
* Cache the peer instance on the btl_proc.
|
||||
*/
|
||||
ib_peer = OBJ_NEW(mca_btl_openib_endpoint_t);
|
||||
if(NULL == ib_peer) {
|
||||
endpoint = OBJ_NEW(mca_btl_openib_endpoint_t);
|
||||
if(NULL == endpoint) {
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ib_peer->endpoint_btl = openib_btl;
|
||||
ib_peer->subnet = openib_btl->port_info.subnet;
|
||||
rc = mca_btl_openib_proc_insert(ib_proc, ib_peer);
|
||||
|
||||
endpoint->endpoint_btl = openib_btl;
|
||||
endpoint->subnet = openib_btl->port_info.subnet;
|
||||
rc = mca_btl_openib_proc_insert(ib_proc, endpoint);
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OBJ_RELEASE(ib_peer);
|
||||
OBJ_RELEASE(endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
orte_pointer_array_add((orte_std_cntr_t*)&ib_peer->index,
|
||||
openib_btl->endpoints, (void*)ib_peer);
|
||||
|
||||
|
||||
orte_pointer_array_add((orte_std_cntr_t*)&endpoint->index,
|
||||
openib_btl->endpoints, (void*)endpoint);
|
||||
ompi_bitmap_set_bit(reachable, i);
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
peers[i] = ib_peer;
|
||||
|
||||
peers[i] = endpoint;
|
||||
}
|
||||
|
||||
return mca_btl_openib_size_queues(openib_btl, nprocs);
|
||||
|
||||
|
||||
}
|
||||
|
||||
int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs)
|
||||
|
@ -284,7 +284,8 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
|
||||
|
||||
ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid);
|
||||
subnet = ntoh64(gid.global.subnet_prefix);
|
||||
|
||||
BTL_VERBOSE((0, "my subnet is %016x\n", subnet));
|
||||
|
||||
if(mca_btl_openib_component.ib_num_btls > 0 &&
|
||||
IB_DEFAULT_GID_PREFIX == subnet &&
|
||||
mca_btl_openib_component.warn_default_gid_prefix) {
|
||||
|
@ -572,7 +572,7 @@ static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoin
|
||||
|
||||
static void mca_btl_openib_endpoint_recv(
|
||||
int status,
|
||||
orte_process_name_t* endpoint,
|
||||
orte_process_name_t* process_name,
|
||||
orte_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
@ -628,31 +628,6 @@ static void mca_btl_openib_endpoint_recv(
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
#if 0
|
||||
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->r_key, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_base, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_size, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_cnt, &cnt, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
|
||||
rem_info.rem_qp_num_hp,
|
||||
@ -665,7 +640,7 @@ static void mca_btl_openib_endpoint_recv(
|
||||
opal_list_get_end(&mca_btl_openib_component.ib_procs);
|
||||
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
|
||||
|
||||
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == ORTE_EQUAL) {
|
||||
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, process_name) == ORTE_EQUAL) {
|
||||
bool found = false;
|
||||
|
||||
/* Try to get the endpoint instance of this proc */
|
||||
@ -695,19 +670,7 @@ static void mca_btl_openib_endpoint_recv(
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* try finding an open port, even if subnets
|
||||
don't match
|
||||
*/
|
||||
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
|
||||
mca_btl_openib_port_info_t port_info;
|
||||
port_info = ib_proc->proc_ports[i];
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(!ib_endpoint->rem_info.rem_lid) {
|
||||
/* found an unused end-point */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(!found) {
|
||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||
|
@ -145,10 +145,10 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* TODO - Endian Ordering fixups for the subnet and such.. just call hton, ntoh
|
||||
always use NBO */
|
||||
module_proc->proc_port_count = size/sizeof(mca_btl_openib_port_info_t);
|
||||
|
||||
|
||||
|
||||
if (0 == module_proc->proc_port_count) {
|
||||
module_proc->proc_endpoints = NULL;
|
||||
} else {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user