This is what r30852 should have been: Consolidate into a single, outer loop of ibv_create_ah() calls
Follow on to SVN trunk r30850: consolidate the ibv_create_ah() calls into a single loop, MPI_WAITALL-style. That is, call the (effectively non-blocking) ibv_create_ah() for each endpoint. If we get NULL+EAGAIN, it means that the UDP ARP is still ongoing down in the kernel, so just try again later. We put these all into a single loop because it allows us to parallelize the ARP progress in the kernel. cmr=v1.7.5:ticket=trac:4253 This commit was SVN r30879. The following SVN revision numbers were found above: r30850 --> open-mpi/ompi@3641500442 r30852 --> open-mpi/ompi@4e282a3295 The following Trac tickets were found above: Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Этот коммит содержится в:
родитель
45810f0efb
Коммит
3cbdf33b88
@ -198,6 +198,9 @@ typedef struct ompi_btl_usnic_component_t {
|
||||
bool connectivity_enabled;
|
||||
int connectivity_ack_timeout;
|
||||
int connectivity_num_retries;
|
||||
|
||||
/* ibv_create_ah() (i.e., ARP) timeout */
|
||||
int arp_timeout;
|
||||
} ompi_btl_usnic_component_t;
|
||||
|
||||
OMPI_MODULE_DECLSPEC extern ompi_btl_usnic_component_t mca_btl_usnic_component;
|
||||
|
@ -267,6 +267,10 @@ int ompi_btl_usnic_component_register(void)
|
||||
USNIC_DFLT_PACK_LAZY_THRESHOLD, &pack_lazy_threshold, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
||||
mca_btl_usnic_component.pack_lazy_threshold = pack_lazy_threshold;
|
||||
|
||||
CHECK(reg_int("arp_timeout", "Timeout, in seconds, for the maximum delay between ARP replies when using the usNIC/UDP transport (ignored when using the usNIC/L2 transport, must be >=1)",
|
||||
10, &mca_btl_usnic_component.arp_timeout,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_6));
|
||||
|
||||
/* Default to bandwidth auto-detection */
|
||||
ompi_btl_usnic_module_template.super.btl_bandwidth = 0;
|
||||
ompi_btl_usnic_module_template.super.btl_latency = 4;
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/prefetch.h"
|
||||
@ -77,6 +78,81 @@ static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
|
||||
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
|
||||
}
|
||||
|
||||
/* The call to ibv_create_ah() may initiate an ARP resolution, and may
|
||||
* therefore take some time to complete. Hence, it will return 1 of 3
|
||||
* things:
|
||||
*
|
||||
* 1. a valid new ah
|
||||
* 2. NULL and errno == EAGAIN (ARP not complete; try again later)
|
||||
* 3. NULL and errno != EAGAIN (fatal error)
|
||||
*
|
||||
* Since ibv_create_ah() is therefore effectively non-blocking, we
|
||||
* gang all the endpoint ah creations here in this loop so that we can
|
||||
* get some parallelization of ARP resolution.
|
||||
*/
|
||||
static int create_ahs(size_t array_len, size_t num_endpoints,
|
||||
struct mca_btl_base_endpoint_t** endpoints,
|
||||
ompi_btl_usnic_module_t *module)
|
||||
{
|
||||
size_t i;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
size_t num_ah_created, last_num_ah_created;
|
||||
time_t ts_last_created;
|
||||
|
||||
/* memset the ah_attr to both silence valgrind warnings (since the
|
||||
attr struct ends up getting written down an fd to the kernel)
|
||||
and actually zero out all the fields that we don't care about /
|
||||
want to be logically false. */
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.port_num = 1;
|
||||
|
||||
ts_last_created = time(NULL);
|
||||
last_num_ah_created = num_ah_created = 0;
|
||||
while (num_ah_created < num_endpoints) {
|
||||
for (i = 0; i < array_len; i++) {
|
||||
if (NULL != endpoints[i] &&
|
||||
NULL == endpoints[i]->endpoint_remote_ah) {
|
||||
ah_attr.grh.dgid = endpoints[i]->endpoint_remote_addr.gid;
|
||||
endpoints[i]->endpoint_remote_ah =
|
||||
ibv_create_ah(module->pd, &ah_attr);
|
||||
if (NULL != endpoints[i]->endpoint_remote_ah) {
|
||||
ts_last_created = time(NULL);
|
||||
++num_ah_created;
|
||||
} else if (EAGAIN != errno) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
"ibv_create_ah()", __FILE__, __LINE__,
|
||||
"Failed to create an address handle");
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Has it been too long since our last AH creation (ARP
|
||||
resolution)? If so, we're probably never going to finish,
|
||||
so just bail. */
|
||||
if (num_ah_created < num_endpoints &&
|
||||
time(NULL) > (ts_last_created + mca_btl_usnic_component.arp_timeout)) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv_create_ah timeout",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
module->if_name,
|
||||
mca_btl_usnic_component.arp_timeout);
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
}
|
||||
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " address handles", num_endpoints);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add procs to this BTL module, receiving endpoint information from
|
||||
* the modex.
|
||||
@ -154,6 +230,22 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic: made %" PRIsize_t " endpoints", count);
|
||||
|
||||
/* Create address handles for all the newly-created endpoints */
|
||||
rc = create_ahs(nprocs, count, endpoints, module);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
if (NULL != endpoints[i]) {
|
||||
OBJ_RELEASE(endpoints[i]);
|
||||
endpoints[i] = NULL;
|
||||
}
|
||||
opal_bitmap_clear_bit(reachable, i);
|
||||
}
|
||||
/* Allow falling through to return OMPI_SUCCESS, even though
|
||||
we couldn't reach everyone (and therefore we gave up). Let
|
||||
the PML know that this module basically isn't reachable,
|
||||
but it's free to continue with other BTL modules. */
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -623,7 +623,6 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
||||
{
|
||||
int err;
|
||||
int modex_index;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
ompi_btl_usnic_endpoint_t *endpoint;
|
||||
|
||||
/* look for matching modex info */
|
||||
@ -654,34 +653,10 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
||||
endpoint->endpoint_next_contig_seq_to_recv - 1;
|
||||
endpoint->endpoint_rfstart = WINDOW_SIZE_MOD(endpoint->endpoint_next_contig_seq_to_recv);
|
||||
|
||||
/* Create the address handle on this endpoint from the modex info.
|
||||
memset to both silence valgrind warnings (since the attr struct
|
||||
ends up getting written down an fd to the kernel) and actually
|
||||
zero out all the fields that we don't care about / want to be
|
||||
logically false. */
|
||||
memset(&ah_attr, 0, sizeof(ah_attr));
|
||||
ah_attr.is_global = 1;
|
||||
ah_attr.port_num = 1;
|
||||
ah_attr.grh.dgid = endpoint->endpoint_remote_addr.gid;
|
||||
|
||||
while (1) {
|
||||
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
|
||||
if (NULL != endpoint->endpoint_remote_ah) {
|
||||
break;
|
||||
}
|
||||
if (EAGAIN == errno) {
|
||||
continue;
|
||||
}
|
||||
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||
true,
|
||||
ompi_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->port_num,
|
||||
"ibv_create_ah()", __FILE__, __LINE__,
|
||||
"Failed to create an address handle");
|
||||
OBJ_RELEASE(endpoint);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
/* Defer creating the ibv_ah. Since calling ibv_create_ah() may
|
||||
trigger ARP resolution, it's better to batch all the endpoints'
|
||||
calls to ibv_create_ah() together to get some parallelism. */
|
||||
endpoint->endpoint_remote_ah = NULL;
|
||||
|
||||
/* Now claim that modex slot */
|
||||
proc->proc_modex_claimed[modex_index] = true;
|
||||
|
@ -270,3 +270,19 @@ Note that this behavior usually indicates some kind of network
|
||||
misconfiguration. You should verify that UDP traffic with payloads up
|
||||
to the "large message size" listed above can flow between these two
|
||||
servers.
|
||||
#
|
||||
[ibv_create_ah timeout]
|
||||
The usnic BTL failed to create addresses for remote peers within the
|
||||
specified timeout. When using the usNIC/UDP transport, this usually
|
||||
means that ARP requests failed to resolve in time. You may be able to
|
||||
solve the problem by increasing the usnic BTL's ARP timeout. If that
|
||||
doesn't work, you should diagnose why ARP replies are apparently not
|
||||
being delivered in a timely manner.
|
||||
|
||||
The usNIC interface listed below will be ignored. Your MPI
|
||||
application will likely either run with degraded performance and/or
|
||||
abort.
|
||||
|
||||
Server: %s
|
||||
Device: %s:%d (%s)
|
||||
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user