1
1

Consolidate into a single, outer loop of ibv_create_ah() calls

Follow on to SVN trunk r30850: consolidate the ibv_create_ah() calls
into a single loop, MPI_WAITALL-style.  That is, call the (effectively
non-blocking) ibv_create_ah() for each endpoint.  If we get
NULL+EAGAIN, it means that the UDP ARP is still ongoing down in the
kernel, so just try again later.  We put these all into a single loop
because it allows us to parallelize the ARP progress in the kernel.

cmr=v1.7.5:ticket=trac:4253

This commit was SVN r30852.

The following SVN revision numbers were found above:
  r30850 --> open-mpi/ompi@3641500442

The following Trac tickets were found above:
  Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Этот коммит содержится в:
Jeff Squyres 2014-02-26 11:02:12 +00:00
родитель 52c48b34f0
Коммит 4e282a3295
2 изменённых файлов: 82 добавлений и 29 удалений

Просмотреть файл

@ -25,6 +25,7 @@
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include "opal/class/opal_bitmap.h"
#include "opal/prefetch.h"
@ -76,6 +77,80 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
}
/*
 * Create the address handle (AH) for every endpoint in the endpoints
 * array that does not have one yet.
 *
 * The call to ibv_create_ah() may initiate an ARP resolution, and may
 * therefore take some time to complete.  Hence, it will return 1 of 3
 * things:
 *
 * 1. a valid new ah
 * 2. NULL and errno == EAGAIN (ARP not complete; try again later)
 * 3. NULL and errno != EAGAIN (fatal error)
 *
 * Since ibv_create_ah() is therefore effectively non-blocking, we
 * gang all the endpoint ah creations here in this loop so that we can
 * get some parallelization of ARP resolution.
 *
 * num_procs:     number of slots in the endpoints array (slots may be
 *                NULL and are then skipped)
 * num_endpoints: number of address handles this call must create
 * endpoints:     array of endpoints; endpoint_remote_ah is filled in
 *                on each one as its handle is created
 * module:        module whose protection domain (pd) is used for the
 *                handles
 *
 * Returns OMPI_SUCCESS once num_endpoints handles have been created,
 * or OMPI_ERR_OUT_OF_RESOURCE if ibv_create_ah() fails with anything
 * other than EAGAIN (the failing endpoint's endpoint_remote_ah is left
 * NULL; callers should check the return value).  If no handle has
 * been created for more than ARP_TIMEOUT_SECS seconds, the process is
 * aborted.
 */
static int create_ahs(size_t num_procs, size_t num_endpoints,
                      struct mca_btl_base_endpoint_t** endpoints,
                      ompi_btl_usnic_module_t *module)
{
    /* Max seconds to wait without any new AH being created before
       giving up.  JMS: probably want to make this an MCA param. */
    enum { ARP_TIMEOUT_SECS = 20 };

    size_t i;
    size_t num_ah_created = 0;
    struct ibv_ah_attr ah_attr;
    time_t ts_last_created;

    /* memset the ah_attr to both silence valgrind warnings (since the
       attr struct ends up getting written down an fd to the kernel)
       and actually zero out all the fields that we don't care about /
       want to be logically false. */
    memset(&ah_attr, 0, sizeof(ah_attr));
    ah_attr.is_global = 1;
    ah_attr.port_num = 1;

    ts_last_created = time(NULL);

    while (num_ah_created < num_endpoints) {
        for (i = 0; i < num_procs; i++) {
            /* Skip empty slots and endpoints that already have an AH */
            if (NULL == endpoints[i] ||
                NULL != endpoints[i]->endpoint_remote_ah) {
                continue;
            }

            ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
            endpoints[i]->endpoint_remote_ah =
                ibv_create_ah(module->pd, &ah_attr);

            if (NULL != endpoints[i]->endpoint_remote_ah) {
                /* Progress was made: restart the stall timer */
                ts_last_created = time(NULL);
                ++num_ah_created;
            } else if (EAGAIN != errno) {
                /* Fatal error: retrying will not help, so report it
                   once and give up instead of retrying (and
                   re-reporting) on every pass. */
                opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
                               true,
                               ompi_process_info.nodename,
                               ibv_get_device_name(module->device),
                               module->port_num,
                               "ibv_create_ah()", __FILE__, __LINE__,
                               "Failed to create an address handle");
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            /* else: EAGAIN -- ARP still in progress; retry this
               endpoint on the next pass */
        }

        /* Has it been too long since our last AH creation (ARP
           resolution)?  If so, we're probably never going to finish,
           so just bail.  Note the direction of the comparison: we
           bail only once *more* than ARP_TIMEOUT_SECS have elapsed
           since the last successful creation. */
        if (num_ah_created < num_endpoints &&
            time(NULL) > ts_last_created + ARP_TIMEOUT_SECS) {
            /* JMS make this a real show_help */
            opal_output(0, "no ARP. life sux for you.");
            abort();
        }
    }

    opal_output_verbose(5, USNIC_OUT,
                        "btl:usnic: made %" PRIsize_t " address handles",
                        num_endpoints);
    return OMPI_SUCCESS;
}
/*
* Add procs to this BTL module, receiving endpoint information from
* the modex.
@ -153,6 +228,9 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: made %" PRIsize_t " endpoints", count);
/* Create address handles for all the newly-created endpoints */
create_ahs(nprocs, count, endpoints, module);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -623,7 +623,6 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
{
int err;
int modex_index;
struct ibv_ah_attr ah_attr;
ompi_btl_usnic_endpoint_t *endpoint;
/* look for matching modex info */
@ -654,34 +653,10 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
endpoint->endpoint_next_contig_seq_to_recv - 1;
endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
/* Create the address handle on this endpoint from the modex info.
memset to both silence valgrind warnings (since the attr struct
ends up getting written down an fd to the kernel) and actually
zero out all the fields that we don't care about / want to be
logically false. */
memset(&ah_attr, 0, sizeof(ah_attr));
ah_attr.is_global = 1;
ah_attr.port_num = 1;
ah_attr.grh.dgid = endpoint->endpoint_remote_addr.gid;
while (1) {
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
if (NULL != endpoint->endpoint_remote_ah) {
break;
}
if (EAGAIN == errno) {
continue;
}
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
true,
ompi_process_info.nodename,
ibv_get_device_name(module->device),
module->port_num,
"ibv_create_ah()", __FILE__, __LINE__,
"Failed to create an address handle");
OBJ_RELEASE(endpoint);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Defer creating the ibv_ah. Since calling ibv_create_ah() may
trigger ARP resolution, it's better to batch all the endpoints'
calls to ibv_create_ah() together to get some parallelism. */
endpoint->endpoint_remote_ah = NULL;
/* Now claim that modex slot */
proc->proc_modex_claimed[modex_index] = true;