1
1

Handle different subnets correctly and multiple nic endpoint negotiation

This is somewhat limited currently; for example, if you have 3 ports on Node A and 5 ports
on Node B then the peers will use 3 ports to communicate with each other.
This is on a subnet basis, so for any pair of nodes we take the
intersection of the available ports within a subnet.

We use subnets to determine reachability for lazy connection establishment. So
if Node A and Node B each have two HCAs (on separate networks) then the
subnets must be distinct, otherwise we will try to wire up HCAs on separate
networks.

This commit was SVN r12978.
Этот коммит содержится в:
Galen Shipman 2007-01-03 22:35:41 +00:00
родитель 7cac26d240
Коммит f12bbe0591
4 изменённых файлов: 69 добавлений и 67 удалений

Просмотреть файл

@ -38,6 +38,8 @@
#include <errno.h>
#include <string.h>
#include <math.h>
#include <inttypes.h>
mca_btl_openib_module_t mca_btl_openib_module = {
{
&mca_btl_openib_component.super,
@ -84,55 +86,91 @@ int mca_btl_openib_add_procs(
ompi_bitmap_t* reachable)
{
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl;
int i, rc;
int i,j, rc;
int remote_subnets;
int local_subnets;
int btl_rank;
for(i = 0; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_openib_proc_t* ib_proc;
mca_btl_base_endpoint_t* ib_peer;
mca_btl_base_endpoint_t* endpoint;
if(NULL == (ib_proc = mca_btl_openib_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
* Check to make sure that the peer has at least as many interface
* addresses exported as we are trying to use. If not, then
* don't bind this PTL instance to the proc.
*/
remote_subnets = 0;
/* check if the remote proc has a reachable subnet first */
BTL_VERBOSE(("got %d port_infos \n", ib_proc->proc_port_count));
for(j = 0; j < (int) ib_proc->proc_port_count; j++){
BTL_VERBOSE(("got a subnet %016x\n",
ib_proc->proc_ports[j].subnet));
if(ib_proc->proc_ports[j].subnet ==
openib_btl->port_info.subnet) {
BTL_VERBOSE(("Got a matching subnet!\n"));
remote_subnets++;
}
}
if(!remote_subnets) {
/* no use trying to communicate with this endpoint later */
BTL_VERBOSE(("No matching subnet was found, moving on.. \n"));
continue;
}
local_subnets = 0;
for(j=0; j < mca_btl_openib_component.ib_num_btls; j++){
if(mca_btl_openib_component.openib_btls[j].port_info.subnet
== openib_btl->port_info.subnet) {
local_subnets++;
}
if(openib_btl == &(mca_btl_openib_component.openib_btls[j])) {
btl_rank = j;
}
}
#if 0
num_endpoints = remote_subnets / local_subnets +
(btl_rank < (remote_subnets / local_subnets)) ? 1:0;
#endif
if(remote_subnets < local_subnets &&
btl_rank >= remote_subnets) {
BTL_VERBOSE(("Not enough remote subnets, moving on.. \n"));
continue;
}
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
/* The btl_proc datastructure is shared by all IB PTL
* instances that are trying to reach this destination.
* Cache the peer instance on the btl_proc.
*/
ib_peer = OBJ_NEW(mca_btl_openib_endpoint_t);
if(NULL == ib_peer) {
endpoint = OBJ_NEW(mca_btl_openib_endpoint_t);
if(NULL == endpoint) {
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
ib_peer->endpoint_btl = openib_btl;
ib_peer->subnet = openib_btl->port_info.subnet;
rc = mca_btl_openib_proc_insert(ib_proc, ib_peer);
endpoint->endpoint_btl = openib_btl;
endpoint->subnet = openib_btl->port_info.subnet;
rc = mca_btl_openib_proc_insert(ib_proc, endpoint);
if(rc != OMPI_SUCCESS) {
OBJ_RELEASE(ib_peer);
OBJ_RELEASE(endpoint);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
continue;
}
orte_pointer_array_add((orte_std_cntr_t*)&ib_peer->index,
openib_btl->endpoints, (void*)ib_peer);
orte_pointer_array_add((orte_std_cntr_t*)&endpoint->index,
openib_btl->endpoints, (void*)endpoint);
ompi_bitmap_set_bit(reachable, i);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
peers[i] = ib_peer;
peers[i] = endpoint;
}
return mca_btl_openib_size_queues(openib_btl, nprocs);
}
int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs)

Просмотреть файл

@ -284,7 +284,8 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid);
subnet = ntoh64(gid.global.subnet_prefix);
BTL_VERBOSE((0, "my subnet is %016x\n", subnet));
if(mca_btl_openib_component.ib_num_btls > 0 &&
IB_DEFAULT_GID_PREFIX == subnet &&
mca_btl_openib_component.warn_default_gid_prefix) {

Просмотреть файл

@ -572,7 +572,7 @@ static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoin
static void mca_btl_openib_endpoint_recv(
int status,
orte_process_name_t* endpoint,
orte_process_name_t* process_name,
orte_buffer_t* buffer,
orte_rml_tag_t tag,
void* cbdata)
@ -628,31 +628,6 @@ static void mca_btl_openib_endpoint_recv(
ORTE_ERROR_LOG(rc);
return;
}
#if 0
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->r_key, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_base, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_size, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_dss.unpack(buffer, &ib_endpoint->rdma_buf->rem_cnt, &cnt, ORTE_UINT32);
if(rc != ORTE_SUCCESS) {
ORTE_ERROR_LOG(rc);
return rc;
}
#endif
BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
rem_info.rem_qp_num_hp,
@ -665,7 +640,7 @@ static void mca_btl_openib_endpoint_recv(
opal_list_get_end(&mca_btl_openib_component.ib_procs);
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, endpoint) == ORTE_EQUAL) {
if(orte_ns.compare_fields(ORTE_NS_CMP_ALL, &ib_proc->proc_guid, process_name) == ORTE_EQUAL) {
bool found = false;
/* Try to get the endpoint instance of this proc */
@ -695,19 +670,7 @@ static void mca_btl_openib_endpoint_recv(
break;
}
}
/* try finding an open port, even if subnets
don't match
*/
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
mca_btl_openib_port_info_t port_info;
port_info = ib_proc->proc_ports[i];
ib_endpoint = ib_proc->proc_endpoints[i];
if(!ib_endpoint->rem_info.rem_lid) {
/* found an unused end-point */
found = true;
break;
}
}
if(!found) {
BTL_ERROR(("can't find suitable endpoint for this peer\n"));

Просмотреть файл

@ -145,10 +145,10 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
return NULL;
}
/* TODO - Endian Ordering fixups for the subnet and such.. just call hton, ntoh
always use NBO */
module_proc->proc_port_count = size/sizeof(mca_btl_openib_port_info_t);
if (0 == module_proc->proc_port_count) {
module_proc->proc_endpoints = NULL;
} else {