Consolidate into a single, outer loop of ibv_create_ah() calls
Follow on to SVN trunk r30850: consolidate the ibv_create_ah() calls into a single loop, MPI_WAITALL-style. That is, call the (effectively non-blocking) ibv_create_ah() for each endpoint. If we get NULL+EAGAIN, it means that the UDP ARP is still ongoing down in the kernel, so just try again later. We put these all into a single loop because it allows us to parallelize the ARP progress in the kernel. cmr=v1.7.5:ticket=trac:4253 This commit was SVN r30852. The following SVN revision numbers were found above: r30850 --> open-mpi/ompi@3641500442 The following Trac tickets were found above: Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Этот коммит содержится в:
родитель
52c48b34f0
Коммит
4e282a3295
@ -25,6 +25,7 @@
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/prefetch.h"
|
||||
@ -76,6 +77,80 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
|
||||
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
|
||||
}
|
||||
|
||||
/* The call to ibv_create_ah() may initiate an ARP resolution, and may
|
||||
* therefore take some time to complete. Hence, it will return 1 of 3
|
||||
* things:
|
||||
*
|
||||
* 1. a valid new ah
|
||||
* 2. NULL and errno == EAGAIN (ARP not complete; try again later)
|
||||
* 3. NULL and errno != EAGAIN (fatal error)
|
||||
*
|
||||
* Since ibv_create_ah() is therefore effectively non-blocking, we
|
||||
* gang all the endpoint ah creations here in this loop so that we can
|
||||
* get some parallelization of ARP resolution.
|
||||
*/
|
||||
static int create_ahs(size_t num_procs, size_t num_endpoints,
|
||||
struct mca_btl_base_endpoint_t** endpoints,
|
||||
ompi_btl_usnic_module_t *module)
|
||||
{
|
||||
size_t i;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
size_t num_ah_created, last_num_ah_created;
|
||||
time_t ts_last_created;
|
||||
|
||||
/* memset the ah_attr to both silence valgrind warnings (since the
|
||||
attr struct ends up getting written down an fd to the kernel)
|
||||
and actually zero out all the fields that we don't care about /
|
||||
want to be logically false. */
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.port_num = 1;
|
||||
|
||||
ts_last_created = time(NULL);
|
||||
last_num_ah_created = num_ah_created = 0;
|
||||
while (num_ah_created < num_endpoints) {
|
||||
for (i = 0; i < num_procs; i++) {
|
||||
if (NULL != endpoints[i] &&
|
||||
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||
ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
|
||||
endpoints[i]->endpoint_remote_ah =
|
||||
ibv_create_ah(module->pd, &ah_attr);
|
||||
if (NULL != endpoints[i]->endpoint_remote_ah) {
|
||||
ts_last_created = time(NULL);
|
||||
++num_ah_created;
|
||||
} else if (EAGAIN != errno) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
"ibv_create_ah()", __FILE__, __LINE__,
|
||||
"Failed to create an address handle");
|
||||
/* JMS WTF to do here? */
|
||||
//OBJ_RELEASE(endpoints[i]);
|
||||
//return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Has it been too long since our last AH creation (ARP
|
||||
resolution)? If so, we're probably never going to finish,
|
||||
so just bail. */
|
||||
/* JMS PROBBALY WANT TO MAKE THIS AN MCA PARAM -- YAY I HAZ
|
||||
ALL THE MCA PARAMS!! */
|
||||
if (num_ah_created < num_endpoints &&
|
||||
ts_last_created + 20 > time(NULL)) {
|
||||
opal_output(0, "no ARP. life sux for you.");
|
||||
// JMS make this a real show_help
|
||||
abort(); // For the moment, die quietly, in the snow
|
||||
}
|
||||
}
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " address handles", num_endpoints);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add procs to this BTL module, receiving endpoint information from
|
||||
* the modex.
|
||||
@ -153,6 +228,9 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " endpoints", count);
|
||||
|
||||
/* Create address handles for all the newly-created endpoints */
|
||||
create_ahs(nprocs, count, endpoints, module);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -623,7 +623,6 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
||||
{
|
||||
int err;
|
||||
int modex_index;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
ompi_btl_usnic_endpoint_t *endpoint;
|
||||
|
||||
/* look for matching modex info */
|
||||
@ -654,34 +653,10 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
||||
endpoint->endpoint_next_contig_seq_to_recv - 1;
|
||||
endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
|
||||
|
||||
/* Create the address handle on this endpoint from the modex info.
|
||||
memset to both silence valgrind warnings (since the attr struct
|
||||
ends up getting written down an fd to the kernel) and actually
|
||||
zero out all the fields that we don't care about / want to be
|
||||
logically false. */
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.port_num = 1;
|
||||
ah_attr.grh.dgid = endpoint->endpoint_remote_addr.gid;
|
||||
|
||||
while (1) {
|
||||
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
|
||||
if (NULL != endpoint->endpoint_remote_ah) {
|
||||
break;
|
||||
}
|
||||
if (EAGAIN == errno) {
|
||||
continue;
|
||||
}
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
"ibv_create_ah()", __FILE__, __LINE__,
|
||||
"Failed to create an address handle");
|
||||
OBJ_RELEASE(endpoint);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* Defer creating the ibv_ah. Since calling ibv_create_ah() may
|
||||
trigger ARP resolution, it's better to batch all the endpoints'
|
||||
calls to ibv_create_ah() together to get some parallelism. */
|
||||
endpoint->endpoint_remote_ah = NULL;
|
||||
|
||||
/* Now claim that modex slot */
|
||||
proc->proc_modex_claimed[modex_index] = true;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user