Consolidate into a single, outer loop of ibv_create_ah() calls
Follow on to SVN trunk r30850: consolidate the ibv_create_ah() calls into a single loop, MPI_WAITALL-style. That is, call the (effectively non-blocking) ibv_create_ah() for each endpoint. If we get NULL+EAGAIN, it means that the UDP ARP is still ongoing down in the kernel, so just try again later. We put these all into a single loop because it allows us to parallelize the ARP progress in the kernel. cmr=v1.7.5:ticket=trac:4253 This commit was SVN r30852. The following SVN revision numbers were found above: r30850 --> open-mpi/ompi@3641500442 The following Trac tickets were found above: Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Этот коммит содержится в:
родитель
52c48b34f0
Коммит
4e282a3295
@ -25,6 +25,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
#include "opal/class/opal_bitmap.h"
|
#include "opal/class/opal_bitmap.h"
|
||||||
#include "opal/prefetch.h"
|
#include "opal/prefetch.h"
|
||||||
@ -76,6 +77,80 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
|
|||||||
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
|
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The call to ibv_create_ah() may initiate an ARP resolution, and may
|
||||||
|
* therefore take some time to complete. Hence, it will return 1 of 3
|
||||||
|
* things:
|
||||||
|
*
|
||||||
|
* 1. a valid new ah
|
||||||
|
* 2. NULL and errno == EAGAIN (ARP not complete; try again later)
|
||||||
|
* 3. NULL and errno != EAGAIN (fatal error)
|
||||||
|
*
|
||||||
|
* Since ibv_create_ah() is therefore effectively non-blocking, we
|
||||||
|
* gang all the endpoint ah creations here in this loop so that we can
|
||||||
|
* get some parallelization of ARP resolution.
|
||||||
|
*/
|
||||||
|
static int create_ahs(size_t num_procs, size_t num_endpoints,
|
||||||
|
struct mca_btl_base_endpoint_t** endpoints,
|
||||||
|
ompi_btl_usnic_module_t *module)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
struct ibv_ah_attr ah_attr;
|
||||||
|
size_t num_ah_created, last_num_ah_created;
|
||||||
|
time_t ts_last_created;
|
||||||
|
|
||||||
|
/* memset the ah_attr to both silence valgrind warnings (since the
|
||||||
|
attr struct ends up getting written down an fd to the kernel)
|
||||||
|
and actually zero out all the fields that we don't care about /
|
||||||
|
want to be logically false. */
|
||||||
|
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||||
|
ah_attr.is_global = 1;
|
||||||
|
ah_attr.port_num = 1;
|
||||||
|
|
||||||
|
ts_last_created = time(NULL);
|
||||||
|
last_num_ah_created = num_ah_created = 0;
|
||||||
|
while (num_ah_created < num_endpoints) {
|
||||||
|
for (i = 0; i < num_procs; i++) {
|
||||||
|
if (NULL != endpoints[i] &&
|
||||||
|
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||||
|
ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
|
||||||
|
endpoints[i]->endpoint_remote_ah =
|
||||||
|
ibv_create_ah(module->pd, &ah_attr);
|
||||||
|
if (NULL != endpoints[i]->endpoint_remote_ah) {
|
||||||
|
ts_last_created = time(NULL);
|
||||||
|
++num_ah_created;
|
||||||
|
} else if (EAGAIN != errno) {
|
||||||
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
|
true,
|
||||||
|
ompi_process_info.nodename,
|
||||||
|
ibv_get_device_name(module->device),
|
||||||
|
module->port_num,
|
||||||
|
"ibv_create_ah()", __FILE__, __LINE__,
|
||||||
|
"Failed to create an address handle");
|
||||||
|
/* JMS WTF to do here? */
|
||||||
|
//OBJ_RELEASE(endpoints[i]);
|
||||||
|
//return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Has it been too long since our last AH creation (ARP
|
||||||
|
resolution)? If so, we're probably never going to finish,
|
||||||
|
so just bail. */
|
||||||
|
/* JMS PROBBALY WANT TO MAKE THIS AN MCA PARAM -- YAY I HAZ
|
||||||
|
ALL THE MCA PARAMS!! */
|
||||||
|
if (num_ah_created < num_endpoints &&
|
||||||
|
ts_last_created + 20 > time(NULL)) {
|
||||||
|
opal_output(0, "no ARP. life sux for you.");
|
||||||
|
// JMS make this a real show_help
|
||||||
|
abort(); // For the moment, die quietly, in the snow
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_output_verbose(5, USNIC_OUT,
|
||||||
|
"btl:usnic: made %" PRIsize_t " address handles", num_endpoints);
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add procs to this BTL module, receiving endpoint information from
|
* Add procs to this BTL module, receiving endpoint information from
|
||||||
* the modex.
|
* the modex.
|
||||||
@ -153,6 +228,9 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
|||||||
opal_output_verbose(5, USNIC_OUT,
|
opal_output_verbose(5, USNIC_OUT,
|
||||||
"btl:usnic: made %" PRIsize_t " endpoints", count);
|
"btl:usnic: made %" PRIsize_t " endpoints", count);
|
||||||
|
|
||||||
|
/* Create address handles for all the newly-created endpoints */
|
||||||
|
create_ahs(nprocs, count, endpoints, module);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -623,7 +623,6 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
|||||||
{
|
{
|
||||||
int err;
|
int err;
|
||||||
int modex_index;
|
int modex_index;
|
||||||
struct ibv_ah_attr ah_attr;
|
|
||||||
ompi_btl_usnic_endpoint_t *endpoint;
|
ompi_btl_usnic_endpoint_t *endpoint;
|
||||||
|
|
||||||
/* look for matching modex info */
|
/* look for matching modex info */
|
||||||
@ -654,34 +653,10 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
|||||||
endpoint->endpoint_next_contig_seq_to_recv - 1;
|
endpoint->endpoint_next_contig_seq_to_recv - 1;
|
||||||
endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
|
endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
|
||||||
|
|
||||||
/* Create the address handle on this endpoint from the modex info.
|
/* Defer creating the ibv_ah. Since calling ibv_create_ah() may
|
||||||
memset to both silence valgrind warnings (since the attr struct
|
trigger ARP resolution, it's better to batch all the endpoints'
|
||||||
ends up getting written down an fd to the kernel) and actually
|
calls to ibv_create_ah() together to get some parallelism. */
|
||||||
zero out all the fields that we don't care about / want to be
|
endpoint->endpoint_remote_ah = NULL;
|
||||||
logically false. */
|
|
||||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
|
||||||
ah_attr.is_global = 1;
|
|
||||||
ah_attr.port_num = 1;
|
|
||||||
ah_attr.grh.dgid = endpoint->endpoint_remote_addr.gid;
|
|
||||||
|
|
||||||
while (1) {
|
|
||||||
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
|
|
||||||
if (NULL != endpoint->endpoint_remote_ah) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (EAGAIN == errno) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
|
||||||
true,
|
|
||||||
ompi_process_info.nodename,
|
|
||||||
ibv_get_device_name(module->device),
|
|
||||||
module->port_num,
|
|
||||||
"ibv_create_ah()", __FILE__, __LINE__,
|
|
||||||
"Failed to create an address handle");
|
|
||||||
OBJ_RELEASE(endpoint);
|
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Now claim that modex slot */
|
/* Now claim that modex slot */
|
||||||
proc->proc_modex_claimed[modex_index] = true;
|
proc->proc_modex_claimed[modex_index] = true;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user