usnic: refactor usnic_add_procs() into 2 distinct parts
1: find/create procs, and create associated endpoint for each 2: resolve peer addresses The 2nd part is done as a separate loop so that the address lookups can be parallelized. The overall result is to split usnic_add_procs() into two smaller, simpler parts. cmr=v1.8.2:ticket=trac:4734 This commit was SVN r32062. The following Trac tickets were found above: Ticket 4734 --> https://svn.open-mpi.org/trac/ompi/ticket/4734
Этот коммит содержится в:
родитель
1ea7bad5a0
Коммит
011db6974e
@ -81,144 +81,18 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
|
||||
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
|
||||
}
|
||||
|
||||
/* The call to ibv_create_ah() may initiate an ARP resolution, and may
|
||||
* therefore take some time to complete. Hence, it will return 1 of 3
|
||||
* things:
|
||||
*
|
||||
* 1. a valid new ah
|
||||
* 2. NULL and errno == EAGAIN (ARP not complete; try again later)
|
||||
* 3. NULL and errno != EAGAIN (fatal error)
|
||||
*
|
||||
* Since ibv_create_ah() is therefore effectively non-blocking, we
|
||||
* gang all the endpoint ah creations here in this loop so that we can
|
||||
* get some parallelization of ARP resolution.
|
||||
*/
|
||||
static int create_ahs(size_t array_len, size_t num_endpoints,
|
||||
struct mca_btl_base_endpoint_t** endpoints,
|
||||
ompi_btl_usnic_module_t *module)
|
||||
{
|
||||
size_t i;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
size_t num_ah_left;
|
||||
time_t ts_last_created;
|
||||
|
||||
/* memset the ah_attr to both silence valgrind warnings (since the
|
||||
attr struct ends up getting written down an fd to the kernel)
|
||||
and actually zero out all the fields that we don't care about /
|
||||
want to be logically false. */
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.port_num = 1;
|
||||
|
||||
ts_last_created = time(NULL);
|
||||
num_ah_left = num_endpoints;
|
||||
|
||||
/* Mark all endpoints as unreachable (this should already be done,
|
||||
but just be defensive) */
|
||||
for (i = 0; i < array_len; i++) {
|
||||
if (NULL != endpoints[i]) {
|
||||
endpoints[i]->endpoint_remote_ah = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
while (num_ah_left > 0) {
|
||||
for (i = 0; i < array_len; i++) {
|
||||
if (NULL != endpoints[i] &&
|
||||
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||
ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
|
||||
endpoints[i]->endpoint_remote_ah =
|
||||
ibv_create_ah(module->pd, &ah_attr);
|
||||
|
||||
/* Got a successfully-created AH */
|
||||
if (NULL != endpoints[i]->endpoint_remote_ah) {
|
||||
ts_last_created = time(NULL);
|
||||
--num_ah_left;
|
||||
}
|
||||
|
||||
/* Got some kind of address failure. This usually
|
||||
means that we couldn't find a route to that peer
|
||||
(e.g., the networking is hosed between us). So
|
||||
just mark that we can't reach this peer, and print
|
||||
a pretty warning. */
|
||||
else if (EADDRNOTAVAIL == errno) {
|
||||
OBJ_RELEASE(endpoints[i]);
|
||||
endpoints[i] = NULL;
|
||||
--num_ah_left;
|
||||
|
||||
/* Print a pretty warning */
|
||||
if (mca_btl_usnic_component.show_route_failures) {
|
||||
char local[IPV4STRADDRLEN], remote[IPV4STRADDRLEN];
|
||||
ompi_btl_usnic_snprintf_ipv4_addr(local, sizeof(local),
|
||||
module->local_addr.ipv4_addr,
|
||||
module->local_addr.cidrmask);
|
||||
ompi_btl_usnic_snprintf_ipv4_addr(remote, sizeof(remote),
|
||||
endpoints[i]->endpoint_remote_addr.ipv4_addr,
|
||||
endpoints[i]->endpoint_remote_addr.cidrmask);
|
||||
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "create_ah failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
local,
|
||||
module->if_name,
|
||||
ibv_get_device_name(module->device),
|
||||
endpoints[i]->endpoint_proc->proc_ompi->proc_hostname,
|
||||
remote);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Got some other kind of error -- give up on this
|
||||
interface. */
|
||||
else if (EAGAIN != errno) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->if_name,
|
||||
"ibv_create_ah()", __FILE__, __LINE__,
|
||||
"Failed to create an address handle");
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Has it been too long since our last AH creation (ARP
|
||||
resolution)? If so, we're probably never going to finish,
|
||||
so just mark all remaining endpoints as unreachable and
|
||||
bail. */
|
||||
if (num_ah_left > 0 &&
|
||||
time(NULL) > (ts_last_created + mca_btl_usnic_component.arp_timeout)) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv_create_ah timeout",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
module->if_name,
|
||||
mca_btl_usnic_component.arp_timeout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " address handles",
|
||||
(num_endpoints - num_ah_left));
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add procs to this BTL module, receiving endpoint information from
|
||||
* the modex.
|
||||
* Loop over all procs sent to us in add_procs and see if we want to
|
||||
* add a proc/endpoint for them.
|
||||
*/
|
||||
static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
static int add_procs_create_endpoints(ompi_btl_usnic_module_t *module,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **ompi_procs,
|
||||
struct mca_btl_base_endpoint_t** endpoints,
|
||||
opal_bitmap_t* reachable)
|
||||
ompi_proc_t **ompi_procs,
|
||||
mca_btl_base_endpoint_t **endpoints)
|
||||
{
|
||||
ompi_btl_usnic_module_t* module = (ompi_btl_usnic_module_t*) base_module;
|
||||
ompi_proc_t* my_proc;
|
||||
size_t i, count;
|
||||
int rc;
|
||||
ompi_proc_t* my_proc;
|
||||
size_t num_created = 0;
|
||||
|
||||
/* get pointer to my proc structure */
|
||||
my_proc = ompi_proc_local();
|
||||
@ -226,8 +100,8 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
count = 0;
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
/* Loop over the procs we were given */
|
||||
for (size_t i = 0; i < nprocs; i++) {
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
ompi_btl_usnic_proc_t* usnic_proc;
|
||||
mca_btl_base_endpoint_t* usnic_endpoint;
|
||||
@ -271,86 +145,256 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Add to array of all procs (proc_match gave us a reference) */
|
||||
/* We like this new endpoint; save it */
|
||||
opal_pointer_array_add(&module->all_procs, usnic_proc);
|
||||
|
||||
union ibv_gid gid = usnic_endpoint->endpoint_remote_addr.gid;
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: new usnic peer: subnet = 0x%016" PRIx64 ", interface = 0x%016" PRIx64,
|
||||
ntoh64(usnic_endpoint->endpoint_remote_addr.gid.global.subnet_prefix),
|
||||
ntoh64(usnic_endpoint->endpoint_remote_addr.gid.global.interface_id));
|
||||
"btl:usnic: new usnic peer endpoint: subnet = 0x%016" PRIx64 ", interface = 0x%016" PRIx64,
|
||||
ntoh64(gid.global.subnet_prefix),
|
||||
ntoh64(gid.global.interface_id));
|
||||
|
||||
opal_bitmap_set_bit(reachable, i);
|
||||
endpoints[i] = usnic_endpoint;
|
||||
opal_output_verbose(15, USNIC_OUT,
|
||||
"btl:usnic: made %p endpoint", (void*) usnic_endpoint);
|
||||
count++;
|
||||
++num_created;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " endpoints", count);
|
||||
|
||||
/* Create address handles for all the newly-created endpoints */
|
||||
rc = create_ahs(nprocs, count, endpoints, module);
|
||||
|
||||
/* If we got non-SUCCESS back, destroy everything */
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
opal_output_verbose(15, USNIC_OUT,
|
||||
"btl:usnic: creating address handles failed on %s/%s, destroying all endpoints",
|
||||
ibv_get_device_name(module->device),
|
||||
module->if_name);
|
||||
|
||||
ompi_btl_usnic_endpoint_t *ep;
|
||||
opal_list_item_t *item, *next;
|
||||
OPAL_LIST_FOREACH_SAFE(item, next, &(module->all_endpoints), opal_list_item_t) {
|
||||
ep = container_of(item, ompi_btl_usnic_endpoint_t,
|
||||
endpoint_endpoint_li);
|
||||
if (NULL != ep->endpoint_remote_ah) {
|
||||
ibv_destroy_ah(ep->endpoint_remote_ah);
|
||||
ep->endpoint_remote_ah = NULL;
|
||||
}
|
||||
OBJ_RELEASE(ep);
|
||||
"btl:usnic: made %" PRIsize_t " endpoints",
|
||||
num_created);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
for (i = 0; i < nprocs; ++i) {
|
||||
endpoints[i] = NULL;
|
||||
opal_bitmap_clear_bit(reachable, i);
|
||||
}
|
||||
/*
|
||||
* Print a warning about how the remote peer was unreachable.
|
||||
*
|
||||
* This is a separate helper function simply because it's somewhat
|
||||
* bulky to put inline.
|
||||
*/
|
||||
static void add_procs_warn_ah_fail(ompi_btl_usnic_module_t *module,
|
||||
ompi_btl_usnic_endpoint_t *endpoint)
|
||||
{
|
||||
/* Only show the warning if it is enabled */
|
||||
if (!mca_btl_usnic_component.show_route_failures) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* If we got success, check for NULL ah values in the array */
|
||||
else {
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
if (NULL != endpoints[i] &&
|
||||
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||
ompi_btl_usnic_endpoint_t *ep = endpoints[i];
|
||||
|
||||
/* We have already done a show_help() with this same
|
||||
basic info in create_ahs(), but do an
|
||||
opal_output_verbose() here, because show_help()
|
||||
will only show the first problem -- not all of
|
||||
them. */
|
||||
char local[IPV4STRADDRLEN], remote[IPV4STRADDRLEN];
|
||||
ompi_btl_usnic_snprintf_ipv4_addr(local, sizeof(remote),
|
||||
ompi_btl_usnic_snprintf_ipv4_addr(local, sizeof(local),
|
||||
module->local_addr.ipv4_addr,
|
||||
module->local_addr.cidrmask);
|
||||
ompi_btl_usnic_snprintf_ipv4_addr(remote, sizeof(remote),
|
||||
ep->endpoint_remote_addr.ipv4_addr,
|
||||
ep->endpoint_remote_addr.cidrmask);
|
||||
endpoint->endpoint_remote_addr.ipv4_addr,
|
||||
endpoint->endpoint_remote_addr.cidrmask);
|
||||
|
||||
opal_output_verbose(15, USNIC_OUT,
|
||||
"btl:usnic: %s/%s (%s) couldn't reach peer %s",
|
||||
ibv_get_device_name(module->device),
|
||||
module->if_name, local, remote);
|
||||
|
||||
opal_list_remove_item(&(module->all_endpoints),
|
||||
&(ep->endpoint_endpoint_li));
|
||||
OBJ_RELEASE(ep);
|
||||
endpoints[i] = NULL;
|
||||
|
||||
opal_bitmap_clear_bit(reachable, i);
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "create_ah failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
local,
|
||||
module->if_name,
|
||||
ibv_get_device_name(module->device),
|
||||
endpoint->endpoint_proc->proc_ompi->proc_hostname,
|
||||
remote);
|
||||
}
|
||||
|
||||
/* The call to ibv_create_ah() may initiate an ARP resolution, and may
|
||||
* therefore take some time to complete. Hence, it will return 1 of 4
|
||||
* things:
|
||||
*
|
||||
* 1. a valid new ah
|
||||
* 2. NULL and errno == EAGAIN (ARP not complete; try again later)
|
||||
* 3. NULL and errno == EADDRNOTAVAIL (unable to reach peer)
|
||||
* 4. NULL and errno != (EAGAIN or ADDRNOTAVAIL) (fatal error)
|
||||
*
|
||||
* Since ibv_create_ah() is therefore effectively non-blocking, we
|
||||
* gang all the endpoint ah creations here in this loop so that we can
|
||||
* get some parallelization of ARP resolution.
|
||||
*/
|
||||
static int add_procs_create_ahs(ompi_btl_usnic_module_t *module,
|
||||
size_t array_len,
|
||||
struct mca_btl_base_endpoint_t **endpoints)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
size_t i;
|
||||
size_t num_ah_left;
|
||||
time_t ts_last_created;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
|
||||
/* memset the ah_attr to both silence valgrind warnings (since the
|
||||
attr struct ends up getting written down an fd to the kernel)
|
||||
and actually zero out all the fields that we don't care about
|
||||
and want to be logically false. */
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.port_num = 1;
|
||||
|
||||
/* Mark all endpoints as unreachable (this should already be done,
|
||||
but just be defensive) */
|
||||
for (num_ah_left = i = 0; i < array_len; i++) {
|
||||
if (NULL != endpoints[i]) {
|
||||
endpoints[i]->endpoint_remote_ah = NULL;
|
||||
++num_ah_left;
|
||||
}
|
||||
}
|
||||
|
||||
ts_last_created = time(NULL);
|
||||
while (num_ah_left > 0) {
|
||||
for (i = 0; i < array_len; i++) {
|
||||
if (NULL != endpoints[i] &&
|
||||
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||
ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
|
||||
endpoints[i]->endpoint_remote_ah =
|
||||
ibv_create_ah(module->pd, &ah_attr);
|
||||
|
||||
/* Got a successfully-created AH */
|
||||
if (NULL != endpoints[i]->endpoint_remote_ah) {
|
||||
ts_last_created = time(NULL);
|
||||
--num_ah_left;
|
||||
}
|
||||
|
||||
/* Got some kind of address failure. This usually
|
||||
means that we couldn't find a route to that peer
|
||||
(e.g., the networking is hosed between us). So
|
||||
just mark that we can't reach this peer, and print
|
||||
a pretty warning. */
|
||||
else if (EADDRNOTAVAIL == errno) {
|
||||
add_procs_warn_ah_fail(module, endpoints[i]);
|
||||
|
||||
OBJ_RELEASE(endpoints[i]);
|
||||
endpoints[i] = NULL;
|
||||
--num_ah_left;
|
||||
}
|
||||
|
||||
/* Got some other kind of error -- give up on this
|
||||
interface. */
|
||||
else if (EAGAIN != errno) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->if_name,
|
||||
"ibv_create_ah()", __FILE__, __LINE__,
|
||||
"Failed to create an address handle");
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Has it been too long since our last AH creation (ARP
|
||||
resolution)? If so, we're probably never going to finish,
|
||||
so just mark all remaining endpoints as unreachable and
|
||||
bail. */
|
||||
if (num_ah_left > 0 &&
|
||||
time(NULL) > (ts_last_created +
|
||||
mca_btl_usnic_component.arp_timeout)) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv_create_ah timeout",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
module->if_name,
|
||||
mca_btl_usnic_component.arp_timeout);
|
||||
break;
|
||||
}
|
||||
|
||||
/* If we still have addresses that aren't resolved yet, sleep
|
||||
a little to let kernel threads do some work behind the
|
||||
scenes */
|
||||
if (num_ah_left > 0) {
|
||||
usleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Look through the list:
|
||||
- If something went wrong above, free all endpoints.
|
||||
- If an otherwise-valid endpoint has no AH, that means we timed
|
||||
out trying to resolve it, so just release that endpoint. */
|
||||
size_t num_created = 0;
|
||||
for (i = 0; i < array_len; i++) {
|
||||
if (NULL != endpoints[i]) {
|
||||
if (OMPI_SUCCESS != ret ||
|
||||
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||
OBJ_RELEASE(endpoints[i]);
|
||||
endpoints[i] = NULL;
|
||||
} else {
|
||||
++num_created;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " address handles",
|
||||
num_created);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add procs to this BTL module, receiving endpoint information from
|
||||
* the modex. This is done in 2 phases:
|
||||
*
|
||||
* 1. Find (or create) the remote proc, and create the associated
|
||||
* endpoint.
|
||||
* 2. Resolve the address handles for all remote endpoints.
|
||||
*
|
||||
* The second part is a separate loop from the first part to allow the
|
||||
* address lookups to be done in parallel. This comes at a cost,
|
||||
* however: we may determine during the 2nd part that we should tear
|
||||
* down some or all the endpoints that we created in the 1st part.
|
||||
* For example, ibv_create_ah() may fail in a fatal way (i.e., we
|
||||
* should fail the entire add_procs()), or it may fail for one or more
|
||||
* peers (i.e., we should just mark those peers as unreachable and not
|
||||
* add a proc or endpoint for them).
|
||||
*/
|
||||
static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **ompi_procs,
|
||||
struct mca_btl_base_endpoint_t** endpoints,
|
||||
opal_bitmap_t* reachable)
|
||||
{
|
||||
ompi_btl_usnic_module_t* module = (ompi_btl_usnic_module_t*) base_module;
|
||||
int rc;
|
||||
|
||||
/* First, create endpoints (and procs, if they're not already
|
||||
created) for all the usnic-reachable procs we were given. */
|
||||
rc = add_procs_create_endpoints(module, nprocs, ompi_procs, endpoints);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* Create address handles for all the newly-created endpoints */
|
||||
rc = add_procs_create_ahs(module, nprocs, endpoints);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* Find all the endpoints with address handles and mark them as
|
||||
reachable */
|
||||
for (size_t i = 0; i < nprocs; ++i) {
|
||||
if (NULL != endpoints[i] &&
|
||||
NULL != endpoints[i]->endpoint_remote_ah) {
|
||||
opal_bitmap_set_bit(reachable, i);
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
fail:
|
||||
/* If we get here, it means something went terribly wrong. Scorch
|
||||
the earth: destroy all endpoints and say that nothing was
|
||||
reachable. */
|
||||
for (size_t i = 0; i < nprocs; ++i) {
|
||||
if (NULL != endpoints[i]) {
|
||||
OBJ_RELEASE(endpoints[i]);
|
||||
endpoints[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -400,7 +444,7 @@ static int usnic_del_procs(struct mca_btl_base_module_t *base_module,
|
||||
}
|
||||
}
|
||||
|
||||
/* remove proc from this module */
|
||||
/* remove proc from this module, and decrement its refcount */
|
||||
for (index = 0; index < module->all_procs.size; ++index) {
|
||||
if (opal_pointer_array_get_item(&module->all_procs, index) ==
|
||||
proc) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user